# Topic 3: Text Classification

## Sentiment Analysis

Predicting if a sentence is a positive sentiment or negative sentiment

In [1]:
import pandas as pd

In [4]:
sentiment_df = pd.read_csv('../data/sentiment_train', sep = '\t')

In [5]:
pd.set_option('max_colwidth', 200)

In [6]:
sentiment_df.sample(4)

Unnamed: 0,sentiment,text
6772,0,"Oh, and Brokeback Mountain is a TERRIBLE movie..."
6893,0,Brokeback Mountain was boring.
6903,0,Brokeback Mountain was boring.
4691,0,"Da Vinci Code = Up, Up, Down, Down, Left, Right, Left, Right, B, A, SUCK!"


In [7]:
sentiment_df.sentiment.value_counts()

1    3943
0    2975
Name: sentiment, dtype: int64

In [8]:
sentiment_df.shape

(6918, 2)

### Vectorizing the sentences

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [10]:
count_vec = CountVectorizer(min_df = 3,
                            max_df = 0.8)

In [11]:
count_vec.fit(sentiment_df.text)

CountVectorizer(max_df=0.8, min_df=3)

In [12]:
train_ds = count_vec.transform(sentiment_df.text)

In [13]:
train_ds

<6918x592 sparse matrix of type '<class 'numpy.int64'>'
	with 63561 stored elements in Compressed Sparse Row format>

In [14]:
feature_names = count_vec.get_feature_names()

In [15]:
len(feature_names)

592

In [16]:
train_ds.getnnz()/(592*6918)

0.015519883500152364

### Frequency of words

Finding out high frequency and low frequency words.

In [20]:
import numpy as np

In [21]:
feature_freq = np.sum(train_ds.toarray(), axis = 0)

In [22]:
features_df = pd.DataFrame( {'feature': feature_names,
                             'frequency': feature_freq} )

In [23]:
features_df.sort_values('frequency', ascending=False)[0:20]

Unnamed: 0,feature,frequency
500,the,3306
23,and,2154
223,harry,2093
403,potter,2093
88,code,2002
544,vinci,2001
112,da,2001
341,mountain,2000
69,brokeback,2000
312,love,1624


In [24]:
from sklearn.feature_extraction import text

In [25]:
stopwords = text.ENGLISH_STOP_WORDS

In [26]:
stopwords = stopwords.union(['movie'])

### New Count Vectorizer

In [27]:
count_vec = CountVectorizer(stop_words = stopwords,
                            min_df = 3,
                            max_df = 0.8)

In [28]:
count_vec.fit(sentiment_df.text)
train_ds = count_vec.transform(sentiment_df.text)
feature_names = count_vec.get_feature_names()
feature_freq = np.sum(train_ds.toarray(), axis = 0)
features_df = pd.DataFrame( {'feature': feature_names,
                             'frequency': feature_freq} )

In [31]:
len(feature_names)

435

### Splitting the dataset

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
X_train, X_test, \
y_train, y_test = train_test_split(train_ds,
                                   sentiment_df.sentiment,
                                   train_size = 0.8)

In [34]:
X_train.shape

(5534, 435)

In [35]:
X_test.shape

(1384, 435)

### Build  a Naive Bayes Model

In [36]:
from sklearn.naive_bayes import BernoulliNB

In [37]:
nb = BernoulliNB()

In [38]:
nb.fit(X_train, y_train)

BernoulliNB()

In [39]:
y_pred_prob = nb.predict_proba(X_test)

In [40]:
y_pred_prob[0:3]

array([[2.28985527e-06, 9.99997710e-01],
       [6.22652959e-03, 9.93773470e-01],
       [9.98657279e-01, 1.34272069e-03]])

In [41]:
from sklearn.metrics import roc_auc_score

In [42]:
roc_auc_score(y_test, y_pred_prob[:,1])

0.9964044695209646

In [43]:
y_pred = nb.predict(X_test)

In [44]:
from sklearn.metrics import confusion_matrix, classification_report

In [45]:
confusion_matrix(y_test, y_pred, [1,0])



array([[782,   4],
       [ 12, 586]])

In [46]:
y_df = pd.DataFrame( { 'actual': y_test,
                       'predicted': y_pred } )

In [47]:
y_df[(y_df.actual == 1) & (y_df.predicted == 0)]

Unnamed: 0,actual,predicted
2031,1,0
2170,1,0
2055,1,0
3098,1,0


In [48]:
pd.set_option('max_colwidth', 800)

In [49]:
sentiment_df[2984:2985]['text']

2984    I was thinking we should have a gay cowboy party in honor of Val's starring performance in Titus.: ) And because I love Brokeback Mountain..
Name: text, dtype: object

In [50]:
from sklearn.ensemble import RandomForestClassifier

In [51]:
rf = RandomForestClassifier(n_estimators = 100,
                            max_features = 0.3,
                            max_depth = 9)

In [52]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=9, max_features=0.3)

In [53]:
feaures_imp = pd.DataFrame({'feature': feature_names,
                            'importance': rf.feature_importances_})

In [54]:
feaures_imp.sort_values('importance', ascending = False)[0:20]

Unnamed: 0,feature,importance
245,love,0.249427
19,awesome,0.180468
365,sucked,0.072772
246,loved,0.063912
367,sucks,0.060527
230,like,0.058647
198,impossible,0.049569
263,mission,0.047895
173,hate,0.044573
24,beautiful,0.028983


### Apply stemming

In [55]:
from nltk.stem import PorterStemmer

In [56]:
stemmer = PorterStemmer()
analyzer = TfidfVectorizer().build_analyzer()

In [57]:
def stem_docs(doc):
    non_stop_words = [word for word in analyzer(doc) 
                      if word not in stopwords]    
    stemmed_words = [stemmer.stem(w) for w in non_stop_words]
    return stemmed_words

In [58]:
count_vec = TfidfVectorizer(analyzer=stem_docs,
                            min_df = 3,
                            max_df = 0.8)

In [59]:
count_vec.fit(sentiment_df.text)

TfidfVectorizer(analyzer=<function stem_docs at 0x117bd8700>, max_df=0.8,
                min_df=3)

In [60]:
train_ds = count_vec.transform(sentiment_df.text)

In [61]:
feature_names = count_vec.get_feature_names()

In [62]:
len(feature_names)

431

In [63]:
X_train, X_test, \
y_train, y_test = train_test_split(train_ds,
                                   sentiment_df.sentiment,
                                   train_size = 0.8)

In [64]:
rf = RandomForestClassifier(n_estimators = 100,
                            max_features = 0.3,
                            max_depth = 9)

In [65]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=9, max_features=0.3)

In [66]:
y_pred_prob = rf.predict_proba(X_test)

In [67]:
roc_auc_score(y_test, y_pred_prob[:,1])

0.999418854005833