# Sentiment analysis

## Loading Data

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [2]:
# Both splits are pipe-delimited CSV files read from the working directory.
train, test = (
    pd.read_csv(csv_name, sep='|')
    for csv_name in ("train.csv", "test.csv")
)

In [3]:
# Preview the first five training rows (5 is the default for head()).
train.head(n=5)

Unnamed: 0,overall,reviewText
0,0,Entertaining enough for those who don't think ...
1,1,I bought it yesterday havent started watching ...
2,1,This movie tells the story of three kids who g...
3,1,You wanna know what its like for a Black perso...
4,1,Warner Archive has finally released an epic fi...


##  Classifier

In [4]:
from sklearn.metrics import f1_score

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Word-level TF-IDF over unigrams and bigrams (no custom tokenizer is passed).
# binary=True makes the term-frequency part 0/1 before IDF weighting;
# max_df=0.85 ignores terms that occur in more than 85% of the documents.
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    binary=True,
    max_df=0.85,
    vocabulary=None,
)

In [7]:
# Alternative featurizer kept for comparison; works worse than TfidfVectorizer.
# NOTE(review): n_features=6 hashes every token into just six columns, so heavy
# collisions are expected -- presumably a large part of the weaker result.
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(
    analyzer='word',
    n_features=6,  # only 6 bins will be used as columns
    # `non_negative=True` was deprecated in scikit-learn 0.19 and removed in
    # 0.21 (it now raises TypeError). alternate_sign=False is the documented
    # replacement: hashed counts are accumulated without sign flipping, so the
    # resulting features stay non-negative.
    alternate_sign=False,
    norm=None,
)

In [8]:
# Classifier under test: a linear SVM. LinearSVC has no predict_proba of its
# own, so it is wrapped in CalibratedClassifierCV, which supplies the calibrated
# probabilities requested later in the notebook.
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC

svm = LinearSVC(C=5)
clf = CalibratedClassifierCV(svm)

In [9]:
# Main model: text vectorizer followed by the classifier held in `clf`.
# NOTE: the "_lr" names are historical; `clf` is the calibrated LinearSVC
# defined in the SVM cell, not a logistic regression.
pipeline_lr = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('clf_lr', clf),
])

In [10]:
# 4-fold cross-validated ROC AUC of the full pipeline, using all available cores.
scores = cross_val_score(
    pipeline_lr,
    X=train["reviewText"],
    y=train["overall"],
    scoring="roc_auc",
    cv=4,
    n_jobs=-1,
)

In [11]:
# Validation score: mean of the per-fold values returned by cross_val_score
# (ROC AUC over 4 folds, per the scoring/cv arguments of the cross-validation cell).
scores.mean()

0.96492020266666667

In [12]:
# Refit the whole pipeline on the full training set before scoring the test set.
pipeline_lr.fit(
    X=train["reviewText"],
    y=train["overall"],
)

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.85, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=Tr... penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
            cv=3, method='sigmoid'))])

In [13]:
# Column 1 of predict_proba: the probability assigned to the second entry of
# `classes_` (label 1, given the 0/1 targets shown in train.head()).
pipeline_lr.predict_proba(test.reviewText)[:, 1]

array([ 0.62879675,  0.67365091,  0.00087392, ...,  0.6233506 ,
        0.00411992,  0.26950722])

In [14]:
# Store column 1 of predict_proba as the `overall` column of the test frame.
# This modifies `test` in place.
test["overall"] = pipeline_lr.predict_proba(test.reviewText)[:, 1]

In [15]:
# Preview the two columns that make up the submission: row id and predicted score.
test.loc[:, ["index", "overall"]].head()

Unnamed: 0,index,overall
0,0,0.628797
1,1,0.673651
2,2,0.000874
3,3,0.996993
4,4,0.061616


In [16]:
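# Submission export, left disabled so that re-running the notebook does not
# overwrite an existing file; uncomment to write the predictions to CSV.
# NOTE: the file name says "LogRegr", but the fitted classifier is the calibrated LinearSVC.
# test[["index","overall"]].to_csv("LogRegr_TFidf.csv", index=False)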