In [9]:
import pandas as pd
import pickle
df1 = pd.read_csv('data.csv')
df1.head()

Unnamed: 0,Rating,Reviews,Target
0,3,It's battery life is great. It's very responsi...,Neutral
1,3,"My fiance had this phone previously, but cause...",Neutral
2,3,unfortunately Sprint could not activate the ph...,Neutral
3,3,the reasons for the 3 star rating was it was i...,Neutral
4,3,"I love the phone, but one problem and one prob...",Neutral


In [10]:
from io import StringIO
col = ['Target', 'Reviews']
df1 = df1[col]
df1 = df1[pd.notnull(df1['Reviews'])]
df1.columns = ['Target', 'Reviews']
df1['category_id'] = df1['Target'].factorize()[0]
category_id_df = df1[['Target', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'Target']].values)
#df.head(100)
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29990 entries, 0 to 30000
Data columns (total 3 columns):
Target         29990 non-null object
Reviews        29990 non-null object
category_id    29990 non-null int64
dtypes: int64(1), object(2)
memory usage: 937.2+ KB


In [11]:
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(8,6))
df1.groupby('Target').Reviews.count().plot.bar(ylim=0)
plt.show()

<Figure size 800x600 with 1 Axes>

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf1 = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
#tfidf = TfidfVectorizer(sublinear_tf=True, stop_words='english')
features1 = tfidf1.fit_transform(df1.Reviews).toarray()
labels = df1.category_id
features1.shape

(29990, 29054)

In [13]:
from sklearn.feature_selection import chi2
import numpy as np
N = 2
for Target, category_id in sorted(category_to_id.items()):
    features_chi2 = chi2(features1, labels == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf1.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(Target))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# 'Negative':
  . Most correlated unigrams:
. great
. good
  . Most correlated bigrams:
. great phone
. didn work
# 'Neutral':
  . Most correlated unigrams:
. blu
. ok
  . Most correlated bigrams:
. blu phone
. ok phone
# 'Positive':
  . Most correlated unigrams:
. excellent
. great
  . Most correlated bigrams:
. great phone
. works great


### Support Vector Machines ###

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
#from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
X_train, X_test, y_train, y_test = train_test_split(df1['Reviews'], df1['Target'], random_state = 0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = LinearSVC().fit(X_train_tfidf, y_train)

In [26]:
from sklearn.metrics import classification_report, confusion_matrix
y_pred=clf.predict(count_vect.transform(X_test))
#y_pred=final.predict("An obvious vanity press for Julie in her first movie with Blake. Let's see. Where do we begin. She is a traitor during a world war; she redeems that by falling in love; her friends (who are presumably patriots because they are German citizens) are expendable and must die; and she winds up as a heroine. OK. The scenes with the drunken pilot and the buffoons who work for French intelligence can't even be described, and we won't even mention Rock's romantic scenes with a female. (By the way, when they visit a museum, look at his gaze - I reran it on video and it's priceless). Is it a farce or is it a romantic classic or is it a war movie? I don't know and you won't either.")

print("-----CONFUSTION MATRIX-----")

print(confusion_matrix(y_test, y_pred))

print("-----CLASSIFICATION REPORT-----")

print(classification_report(y_test,y_pred))

-----CONFUSTION MATRIX-----
[[2129  285   62]
 [ 294 1987  209]
 [ 123  391 2018]]
-----CLASSIFICATION REPORT-----
              precision    recall  f1-score   support

    Negative       0.84      0.86      0.85      2476
     Neutral       0.75      0.80      0.77      2490
    Positive       0.88      0.80      0.84      2532

    accuracy                           0.82      7498
   macro avg       0.82      0.82      0.82      7498
weighted avg       0.82      0.82      0.82      7498



In [34]:
#Testing Prediction

print(clf.predict(count_vect.transform(["The phone is okay"])))

['Neutral']
