In [1]:
import pandas as pd
import numpy as np
import string
from pymystem3 import Mystem
from joblib import dump, load

In [2]:
# Shared Mystem instance; preprocess() below calls its lemmatize() method.
m = Mystem()

def preprocess(text):
    """Normalise one review string before vectorisation.

    Steps, in order: drop every character listed in ``string.punctuation``,
    lowercase the result, run it through the module-level ``m.lemmatize``,
    glue the returned pieces back together and trim surrounding whitespace.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, lemmatized text.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.translate(punctuation_table).lower()
    # lemmatize() yields string pieces that are concatenated without a separator.
    return ''.join(m.lemmatize(cleaned)).strip()

In [3]:
# Registry of fitted models, keyed by category name; filled in by the
# per-category training sections and written to disk at the end.
classifiers = dict()

In [4]:
def _load_reviews(category):
    """Load and preprocess the reviews of one category.

    Reads ``./dataset/<category>.csv``, takes its ``'0'`` column and applies
    ``preprocess`` to the string form of every entry.

    Parameters
    ----------
    category : str
        Base name of the CSV file (without extension) inside ``./dataset``.

    Returns
    -------
    pandas.Series
        The preprocessed review texts, with the index produced by ``read_csv``.
    """
    raw = pd.read_csv('./dataset/' + category + '.csv')['0']
    return raw.apply(lambda x: preprocess(str(x)))


cheap = _load_reviews('cheap')
expensive = _load_reviews('expensive')
family = _load_reviews('family')
friends = _load_reviews('friends')
# Fix: this used to be built from `friends`, which discarded the kids data
# and trained the kids model on (doubly preprocessed) friends reviews.
kids = _load_reviews('kids')
romantic = _load_reviews('romantic')

# Total number of reviews across all categories.
reviews = sum(len(part) for part in (cheap, expensive, family,
                                     friends, kids, romantic))

In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import (LinearRegression, Lasso, ElasticNet, Ridge,
                                  Perceptron, LogisticRegression,
                                  SGDClassifier)
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier

### Cheap

In [6]:
# Binary "cheap vs. everything else" classifier.
# Stack every category's preprocessed reviews in a fixed order and build a
# parallel label vector: 1.0 for the target category, 0.0 for all others.
parts = [cheap, expensive, family, friends, kids, romantic]
flags = [1, 0, 0, 0, 0, 0]  # the 1 marks the positive category in `parts`

# pd.concat replaces Series.append, which was removed in pandas 2.0.
# Each part keeps its own index, exactly as append() did.
X = pd.concat(parts)
# Labels are built from the parts themselves instead of being written into an
# uninitialised np.empty buffer sized by a counter from another cell.
y = np.concatenate([np.full(len(part), flag, dtype=float)
                    for part, flag in zip(parts, flags)])

df = pd.DataFrame()
df['reviews'] = X
df['labels'] = y  # ndarray: assigned by position, not by index

positive = df[df['labels'] == 1]
negative = df[df['labels'] == 0]

# Balance the classes: all positives plus an equally sized random sample of
# negatives.
# NOTE(review): sample() and train_test_split() are unseeded, so the trained
# model differs from run to run.
df = pd.concat([positive, negative.sample(len(positive))])
X = df['reviews']
y = df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)

# The TF-IDF vocabulary is learned from the training split only.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000, min_df=1)
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

clf = OneVsRestClassifier(MultinomialNB(alpha=1e-3))
clf.fit(X_train, y_train)
# NOTE(review): only the classifier is stored; `vectorizer` is overwritten by
# the next section and is not saved in this notebook.
classifiers['cheap'] = clf

### Expensive

In [7]:
# Binary "expensive vs. everything else" classifier.
# Stack every category's preprocessed reviews in a fixed order and build a
# parallel label vector: 1.0 for the target category, 0.0 for all others.
parts = [cheap, expensive, family, friends, kids, romantic]
flags = [0, 1, 0, 0, 0, 0]  # the 1 marks the positive category in `parts`

# pd.concat replaces Series.append, which was removed in pandas 2.0.
# Each part keeps its own index, exactly as append() did.
X = pd.concat(parts)
# Labels are built from the parts themselves instead of being written into an
# uninitialised np.empty buffer sized by a counter from another cell.
y = np.concatenate([np.full(len(part), flag, dtype=float)
                    for part, flag in zip(parts, flags)])

df = pd.DataFrame()
df['reviews'] = X
df['labels'] = y  # ndarray: assigned by position, not by index

positive = df[df['labels'] == 1]
negative = df[df['labels'] == 0]

# Balance the classes: all positives plus an equally sized random sample of
# negatives.
# NOTE(review): sample() and train_test_split() are unseeded, so the trained
# model differs from run to run.
df = pd.concat([positive, negative.sample(len(positive))])
X = df['reviews']
y = df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)

# The TF-IDF vocabulary is learned from the training split only.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000, min_df=1)
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

clf = OneVsRestClassifier(MultinomialNB(alpha=1e-3))
clf.fit(X_train, y_train)
# NOTE(review): only the classifier is stored; `vectorizer` is overwritten by
# the next section and is not saved in this notebook.
classifiers['expensive'] = clf

### Family

In [8]:
# Binary "family vs. everything else" classifier.
# Stack every category's preprocessed reviews in a fixed order and build a
# parallel label vector: 1.0 for the target category, 0.0 for all others.
parts = [cheap, expensive, family, friends, kids, romantic]
flags = [0, 0, 1, 0, 0, 0]  # the 1 marks the positive category in `parts`

# pd.concat replaces Series.append, which was removed in pandas 2.0.
# Each part keeps its own index, exactly as append() did.
X = pd.concat(parts)
# Labels are built from the parts themselves instead of being written into an
# uninitialised np.empty buffer sized by a counter from another cell.
y = np.concatenate([np.full(len(part), flag, dtype=float)
                    for part, flag in zip(parts, flags)])

df = pd.DataFrame()
df['reviews'] = X
df['labels'] = y  # ndarray: assigned by position, not by index

positive = df[df['labels'] == 1]
negative = df[df['labels'] == 0]

# Balance the classes: all positives plus an equally sized random sample of
# negatives.
# NOTE(review): sample() and train_test_split() are unseeded, so the trained
# model differs from run to run.
df = pd.concat([positive, negative.sample(len(positive))])
X = df['reviews']
y = df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)

# The TF-IDF vocabulary is learned from the training split only.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000, min_df=1)
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

clf = OneVsRestClassifier(MultinomialNB(alpha=1e-3))
clf.fit(X_train, y_train)
# NOTE(review): only the classifier is stored; `vectorizer` is overwritten by
# the next section and is not saved in this notebook.
classifiers['family'] = clf

### Friends

In [9]:
# Binary "friends vs. everything else" classifier.
# Stack every category's preprocessed reviews in a fixed order and build a
# parallel label vector: 1.0 for the target category, 0.0 for all others.
parts = [cheap, expensive, family, friends, kids, romantic]
flags = [0, 0, 0, 1, 0, 0]  # the 1 marks the positive category in `parts`

# pd.concat replaces Series.append, which was removed in pandas 2.0.
# Each part keeps its own index, exactly as append() did.
X = pd.concat(parts)
# Labels are built from the parts themselves instead of being written into an
# uninitialised np.empty buffer sized by a counter from another cell.
y = np.concatenate([np.full(len(part), flag, dtype=float)
                    for part, flag in zip(parts, flags)])

df = pd.DataFrame()
df['reviews'] = X
df['labels'] = y  # ndarray: assigned by position, not by index

positive = df[df['labels'] == 1]
negative = df[df['labels'] == 0]

# Balance the classes: all positives plus an equally sized random sample of
# negatives.
# NOTE(review): sample() and train_test_split() are unseeded, so the trained
# model differs from run to run.
df = pd.concat([positive, negative.sample(len(positive))])
X = df['reviews']
y = df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)

# The TF-IDF vocabulary is learned from the training split only.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000, min_df=1)
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

clf = OneVsRestClassifier(MultinomialNB(alpha=1e-3))
clf.fit(X_train, y_train)
# NOTE(review): only the classifier is stored; `vectorizer` is overwritten by
# the next section and is not saved in this notebook.
classifiers['friends'] = clf

### Kids

In [10]:
# Binary "kids vs. everything else" classifier.
# Stack every category's preprocessed reviews in a fixed order and build a
# parallel label vector: 1.0 for the target category, 0.0 for all others.
parts = [cheap, expensive, family, friends, kids, romantic]
flags = [0, 0, 0, 0, 1, 0]  # the 1 marks the positive category in `parts`

# pd.concat replaces Series.append, which was removed in pandas 2.0.
# Each part keeps its own index, exactly as append() did.
X = pd.concat(parts)
# Labels are built from the parts themselves instead of being written into an
# uninitialised np.empty buffer sized by a counter from another cell.
y = np.concatenate([np.full(len(part), flag, dtype=float)
                    for part, flag in zip(parts, flags)])

df = pd.DataFrame()
df['reviews'] = X
df['labels'] = y  # ndarray: assigned by position, not by index

positive = df[df['labels'] == 1]
negative = df[df['labels'] == 0]

# Balance the classes: all positives plus an equally sized random sample of
# negatives.
# NOTE(review): sample() and train_test_split() are unseeded, so the trained
# model differs from run to run.
df = pd.concat([positive, negative.sample(len(positive))])
X = df['reviews']
y = df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)

# The TF-IDF vocabulary is learned from the training split only.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000, min_df=1)
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

clf = OneVsRestClassifier(MultinomialNB(alpha=1e-3))
clf.fit(X_train, y_train)
# NOTE(review): only the classifier is stored; `vectorizer` is overwritten by
# the next section and is not saved in this notebook.
classifiers['kids'] = clf

### Romantic

In [11]:
# Binary "romantic vs. everything else" classifier.
# Stack every category's preprocessed reviews in a fixed order and build a
# parallel label vector: 1.0 for the target category, 0.0 for all others.
parts = [cheap, expensive, family, friends, kids, romantic]
flags = [0, 0, 0, 0, 0, 1]  # the 1 marks the positive category in `parts`

# pd.concat replaces Series.append, which was removed in pandas 2.0.
# Each part keeps its own index, exactly as append() did.
X = pd.concat(parts)
# Labels are built from the parts themselves instead of being written into an
# uninitialised np.empty buffer sized by a counter from another cell.
y = np.concatenate([np.full(len(part), flag, dtype=float)
                    for part, flag in zip(parts, flags)])

df = pd.DataFrame()
df['reviews'] = X
df['labels'] = y  # ndarray: assigned by position, not by index

positive = df[df['labels'] == 1]
negative = df[df['labels'] == 0]

# Balance the classes: all positives plus an equally sized random sample of
# negatives.
# NOTE(review): sample() and train_test_split() are unseeded, so the trained
# model differs from run to run.
df = pd.concat([positive, negative.sample(len(positive))])
X = df['reviews']
y = df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)

# The TF-IDF vocabulary is learned from the training split only.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000, min_df=1)
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

clf = OneVsRestClassifier(MultinomialNB(alpha=1e-3))
clf.fit(X_train, y_train)
# NOTE(review): only the classifier is stored; `vectorizer` is overwritten by
# the next section and is not saved in this notebook.
classifiers['romantic'] = clf

# m

In [12]:
# Persist each fitted model as '<category>.joblib' in the working directory.
# Distinct loop names keep the global `clf` (last fitted model) from being
# overwritten with a string key.
# NOTE(review): the TF-IDF vectorizers fitted above are not saved here, and the
# models expect input in that vectorized form; confirm they are persisted
# elsewhere.
for category, model in classifiers.items():
    dump(model, category + '.joblib')