work with text data

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn import metrics

In [23]:
df = pd.read_csv("train.csv")

In [24]:
df.shape

(3097, 4)

In [25]:
df.head()

Unnamed: 0,benefits_review,side_effects_review,comments_review,rating
0,slowed the progression of left ventricular dys...,"cough, hypotension , proteinuria, impotence , ...","monitor blood pressure , weight and asses for ...",4
1,Although this type of birth control has more c...,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...","I Hate This Birth Control, I Would Not Suggest...",1
2,I was used to having cramps so badly that they...,Heavier bleeding and clotting than normal.,I took 2 pills at the onset of my menstrual cr...,10
3,The acid reflux went away for a few months aft...,"Constipation, dry mouth and some mild dizzines...",I was given Prilosec prescription at a dose of...,3
4,I think that the Lyrica was starting to help w...,I felt extremely drugged and dopey. Could not...,See above,2


In [26]:
df.isna().sum()

benefits_review        0
side_effects_review    0
comments_review        0
rating                 0
dtype: int64

In [27]:
df.rating.value_counts()

10    741
8     555
9     479
7     350
1     304
5     159
6     156
3     145
4     105
2     103
Name: rating, dtype: int64

In [28]:
df['review'] = df['benefits_review'] + ' ' + df['side_effects_review'] + ' ' + df['comments_review']

In [29]:
X = df.loc[:,('review')]
y = df.rating

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [31]:
y_train.value_counts()

10    519
8     388
9     335
7     245
1     213
5     111
6     109
3     101
4      74
2      72
Name: rating, dtype: int64

In [32]:
X_train_docs = [doc for doc in X_train]

In [33]:
# use unigrams and biagrams
vect = CountVectorizer(ngram_range=(1, 2), stop_words="english", max_features=1000).fit(X_train_docs)

In [34]:
X_train_tr = vect.transform(X_train_docs)

In [35]:
vect.get_feature_names()[:20]



['10',
 '10 days',
 '10 mg',
 '10 years',
 '100',
 '100 mg',
 '100mg',
 '10mg',
 '11',
 '12',
 '15',
 '150',
 '150mg',
 '20',
 '20 mg',
 '20 minutes',
 '200',
 '200mg',
 '20mg',
 '24']

In [36]:
X_train_tr.toarray()[:5, :20]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [37]:
X_train_tr.toarray()[:5, :-10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]])

train model using LinearSVC classifier

In [38]:
# check out LinearSVC
lin_svc = LinearSVC(max_iter=120000)

In [39]:
scores = cross_val_score(lin_svc, X_train_tr, y_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

Mean cross-validation accuracy: 0.20


In [40]:
lin_svc.fit(X_train_tr, y_train)

LinearSVC(max_iter=120000)

validate model using test set

In [41]:
X_test_docs = [doc for doc in X_test]
X_test_features = vect.transform(X_test_docs)

In [42]:
y_test_pred = lin_svc.predict(X_test_features)

In [43]:
metrics.accuracy_score(y_test, y_test_pred)

0.22258064516129034