In [73]:
import pandas as pd
opinions=pd.read_csv("amazon.txt",sep='\t', header=None, names=["Review","Result"],quoting=3)
opinions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Result  1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [3]:
opinions['Review'].head()

0    So there is no way for me to plug it in here i...
1                          Good case, Excellent value.
2                               Great for the jawbone.
3    Tied to charger for conversations lasting more...
4                                    The mic is great.
Name: Review, dtype: object

Firs I cleaned the data. I got rid of the stopwords(meaningless words such as a, an etc) and I stemmed words with the same origin (then they are treated as same word). Moreover, all elements which are not words, such as numbers, punctuation etc were replaced with a whitespace

In [4]:

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import re
# connecting different instances of the word to one, like connection, connections, connective, connected
stemmer=SnowballStemmer('english')
# stop words are meaningless common words like and, a, an
stop_words=stopwords.words("english")
# applying lambda to each column: replacing every chaaracter that is not a letter to a blank, splitting into words,
# and we use stemmer if the word is not a stopword, and than we join it together into a sentence with blank
opinions["cleaned"]=opinions["Review"].apply(lambda x: " ".join(
    [stemmer.stem(i) for i in re.sub("[^a-zA-Z]"," ",x).split() if i not in stop_words]).lower())
opinions["cleaned"].head()

0              so way plug us unless i go convert
1                            good case excel valu
2                                    great jawbon
3    tie charger convers last minut major problem
4                                   the mic great
Name: cleaned, dtype: object

In [63]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

x_train,x_test,y_train,y_test=train_test_split(opinions['Review'],opinions["Result"],test_size=0.2)
pip=Pipeline([('vectorizer',TfidfVectorizer(ngram_range=(1, 2),stop_words="english")),
              ('classifier',LinearSVC(random_state=10))
             ])
opinions_model=pip.fit(x_train,y_train)


To perform machine learning on text, we need to present is as a numerical vector, which I obtain with vectorizer. Than I apply Linear SVC model to the data 

In [64]:
from sklearn.model_selection import cross_val_score
y_train_pred=cross_val_score(opinions_model,x_train,y_train,cv=5)
y_train_pred

array([0.7625 , 0.775  , 0.80625, 0.7625 , 0.775  ])

Since cross value by itself does not say much about details of our model successes/faults, we can present results as a confusion matrix, which shows precisely number of properly classified values as well as true values classified as false and reverse. The ideal result would be diagonal matrix

In [65]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
y_train_pred=cross_val_predict(opinions_model,x_train,y_train,cv=5)
confusion_matrix(y_train,y_train_pred)

array([[308,  94],
       [ 85, 313]], dtype=int64)

In [66]:
from sklearn.metrics import precision_score,recall_score
print(precision_score(y_train,y_train_pred))
print(recall_score(y_train,y_train_pred))
print(accuracy_score(y_train,y_train_pred))



0.769041769041769
0.7864321608040201
0.77625


Precision,recall and accuracy tells us more about ratios between true positives and all positives, all true and false positives and general ratio between properly classfied data and all data

In [67]:
opinions_model.predict(["My daughter was really satisfied with her gift"])

array([1], dtype=int64)

In [68]:
opinions_model.predict(["My daughter totally hated her gift, I will never buy it again"])

array([0], dtype=int64)

Just as a test, two similar sentences but with different strongly positively/negatively characterized words

In [69]:
y_train_pred=cross_val_score(opinions_model,x_test,y_test,cv=5)
y_train_pred

array([0.775, 0.8  , 0.625, 0.675, 0.85 ])

In [70]:
y_test_pred=cross_val_predict(opinions_model,x_test,y_test,cv=5)
confusion_matrix(y_test,y_test_pred)

array([[72, 26],
       [25, 77]], dtype=int64)

In [71]:
print(precision_score(y_test,y_test_pred))
print(recall_score(y_test,y_test_pred))
print(accuracy_score(y_test,y_test_pred))

0.7475728155339806
0.7549019607843137
0.745


Similar experiments on the accuracy performed on the test set. As we see, results are slightly worse, but not much different, so I consider it as a good result

In [43]:
second_pip=Pipeline([('vectorizer',TfidfVectorizer(ngram_range=(1, 2),stop_words="english")),
              ('classifier',LinearSVC(penalty='l1',dual=False))
             ])
second_opinions_model=second_pip.fit(x_train,y_train)
second_opinions_model.score(x_test,y_test)


0.785

Tests on the same model, but woth penalty changed to L1. L1 regularization adds a penalty equal to the absolute value of the magnitude of coefficients to the cost function

In [12]:
y_test_pred_second=cross_val_score(second_opinions_model,x_test,y_test,cv=5)
y_test_pred_second

array([0.775, 0.65 , 0.575, 0.75 , 0.775])

In [13]:
y_test_pred_second=cross_val_predict(second_opinions_model,x_test,y_test,cv=5)
confusion_matrix(y_test,y_test_pred_second)

array([[84, 20],
       [39, 57]], dtype=int64)

In [20]:

print(precision_score(y_test,y_test_pred_second))
print(recall_score(y_test,y_test_pred_second))
print(accuracy_score(y_test,y_test_pred_second))


0.7402597402597403
0.59375
0.705


In [33]:
from sklearn.ensemble import GradientBoostingClassifier
pip_gradient=Pipeline([('vectorizer',TfidfVectorizer(ngram_range=(1, 2),stop_words="english",max_df=8)),
              ('classifier', GradientBoostingClassifier(random_state=20))
             ])
opinions_model_gradient=pip_gradient.fit(x_train,y_train)
opinions_model_gradient.score(x_test,y_test)


0.64

Finally, same tests performed on totally different model. As we see, in general I would say the best fit would be the first one, because it's results are much better and consistent than the other ones

In [34]:
y_train_pred_gradient=cross_val_score(opinions_model_gradient,x_test,y_test,cv=5)
y_train_pred_gradient

array([0.7  , 0.675, 0.625, 0.7  , 0.725])

In [35]:
y_test_pred_gradient=cross_val_predict(opinions_model_gradient,x_test,y_test,cv=5)
confusion_matrix(y_test,y_test_pred_gradient)

array([[94, 10],
       [53, 43]], dtype=int64)

In [36]:
print(precision_score(y_test,y_test_pred_gradient))
print(recall_score(y_test,y_test_pred_gradient))
print(accuracy_score(y_test,y_test_pred_gradient))


0.8113207547169812
0.4479166666666667
0.685
