In [12]:
import sklearn
import pickle
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [13]:
# Load the tab-separated sentiment corpus. It is read with header=None, so
# the two column names are supplied explicitly.
sentiment_data = pd.read_csv(
    'datasets/sentimental_analysis_data.csv',
    sep='\t',
    header=None,
    names=['Label', 'Text'],
)

# Show ten randomly chosen rows as a quick sanity check of the parse.
sentiment_data.sample(10)

Unnamed: 0,Label,Text
5015,0,How?.-Mission Impossible III is really boring ...
6298,0,Brokeback Mountain is fucking horrible..
1288,1,we're gonna like watch Mission Impossible or H...
5143,0,"I hate you Harry Potter, you fucking whiny lit..."
1050,1,mission impossible III was awesome...
1522,1,Mission Impossible 3 was excellent.
171,1,i loved the da vinci code was a kick ass movie...
1260,1,i love being a sentry for mission impossible a...
2797,1,I love Harry Potter..
4938,0,I hate Mission Impossible.


In [14]:
# Display the (row count, column count) of the loaded frame.
sentiment_data.shape

(6918, 2)

In [15]:
# Model input is the 'Text' column; the prediction target is the 'Label' column.
X, Y = sentiment_data['Text'], sentiment_data['Label']

In [16]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle deterministic, so the split -- and every accuracy figure computed
# from it below -- is the same after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [17]:
# TF-IDF featurizer with its vocabulary capped at 15 terms (max_features).
# This single instance is used as the first step of each pipeline below.
tfidf_vect = TfidfVectorizer(
    max_features=15,
)

In [18]:
# First candidate classifier: logistic regression using the liblinear solver.
logistic_clf = LogisticRegression(
    solver='liblinear',
)

In [19]:
# Chain the TF-IDF vectorizer and the logistic-regression classifier so the
# pipeline accepts raw text and produces a label.
clf_pipeline = Pipeline(
    steps=[
        ('tfidf_vect', tfidf_vect),
        ('classifier', logistic_clf),
    ]
)

# Fit both steps on the training split; the fitted pipeline is kept under
# its own name for prediction and pickling.
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [20]:
# Predict labels for the held-out texts and score them against the truth.
y_pred = pipeline_model.predict(x_test)
Accuracy_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Last expression of the cell, so the accuracy is displayed.
Accuracy_score

0.8995664739884393

In [21]:
# Persist the fitted logistic-regression pipeline. The `with` block closes
# (and therefore flushes) the file handle even if pickling raises, instead
# of leaving an open handle for the garbage collector to clean up.
with open('models/logistic_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)

## Second Model: Decision Tree Classifier

In [22]:
# Second candidate classifier: a decision tree limited to depth 10.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split; without a seed, repeated fits can produce different trees.
decision_tree_clf = DecisionTreeClassifier(max_depth=10, random_state=42)

In [23]:
# Same two-step layout as before, now with the decision tree as the final
# estimator. Note that this rebinds clf_pipeline and pipeline_model.
clf_pipeline = Pipeline(
    steps=[
        ('tfidf_vect', tfidf_vect),
        ('classifier', decision_tree_clf),
    ]
)

# Fit the vectorizer and the tree on the training split.
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [24]:
# Score the decision-tree pipeline on the held-out split.
y_pred = pipeline_model.predict(x_test)
Accuracy_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Last expression of the cell, so the accuracy is displayed.
Accuracy_score

0.9010115606936416

In [26]:
# Persist the fitted decision-tree pipeline. The `with` block closes (and
# therefore flushes) the file handle even if pickling raises, instead of
# leaving an open handle for the garbage collector to clean up.
with open('models/decision_tree_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)

## Third Model: Linear SVC

In [27]:
# Third candidate classifier: a linear support-vector classifier.
# random_state is fixed so that the data shuffling done by the dual
# coordinate-descent solver (when it is used) is repeatable across runs.
# NOTE(review): max_iter=100 is well below the library default; watch for a
# ConvergenceWarning when fitting and raise it if one appears.
linear_svc_clf = LinearSVC(C=1, max_iter=100, random_state=42)

# Same two-step layout as the earlier models, with the SVC as the final
# estimator. This rebinds clf_pipeline and pipeline_model.
clf_pipeline = Pipeline(
    steps=[
        ('tfidf_vect', tfidf_vect),
        ('classifier', linear_svc_clf),
    ]
)
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [28]:
# Score the linear-SVC pipeline on the held-out split.
y_pred = pipeline_model.predict(x_test)
Accuracy_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Last expression of the cell, so the accuracy is displayed.
Accuracy_score

0.8988439306358381

In [29]:
# Persist the fitted linear-SVC pipeline. The `with` block closes (and
# therefore flushes) the file handle even if pickling raises, instead of
# leaving an open handle for the garbage collector to clean up.
with open('models/linear_svc_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)