In [1]:
import gzip
import itertools
import string

import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import pylab as pl
import nltk


from collections import Counter
from sklearn import svm
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer


%matplotlib inline

In [19]:
from sklearn.tree import DecisionTreeClassifier

In [2]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet
from sklearn import metrics
from collections import defaultdict

In [5]:
# Load the labelled training reviews (path is relative to the notebook's working directory).
# NOTE(review): the frame comes back with an "Unnamed: 0" column, which looks like a
# saved row index -- consider reading with index_col=0 once downstream cells are checked.
df = pd.read_csv("dataset/train.csv")
df.head()

Unnamed: 0,polarity,title,text
0,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
1,2,Amazing!,This soundtrack is my favorite music of all ti...
2,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
3,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
4,2,an absolute masterpiece,I am quite sure any of you actually taking the...


In [6]:
# Raw review bodies as a Series
text = df['text']

In [7]:
# Number of non-null entries in the review-text column
text.count()

1048575

In [8]:
# Fetch the NLTK stop-word corpus needed by stopwords.words() below
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mohwa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# English stop-word list from NLTK; preprocess() drops any token found in it
stop_words = stopwords.words('english')

In [10]:
# Shared helpers for preprocess(): built once here so they are not re-created per document.
# The tokenizer keeps only runs of lowercase ASCII letters (digits/punctuation act as separators).
tokenizer = RegexpTokenizer(r'[a-z]+')
# WordNet-backed lemmatizer used to reduce tokens to their base form.
wordnet_lemmatizer = WordNetLemmatizer()

def preprocess(document):
    """Normalise one review into a space-separated string of lemmas.

    Steps: lower-case the text, keep only the tokens matched by the
    notebook-level ``tokenizer``, drop every token present in ``stop_words``,
    then lemmatize each remaining token successively as noun, verb,
    adjective and adverb.

    Parameters
    ----------
    document : str
        Raw review text. Anything that is not a string (for example the
        float NaN pandas uses for an empty CSV field, or None) is treated
        as an empty document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; "" when nothing is left.
    """
    if not isinstance(document, str):
        # Missing CSV fields arrive as float NaN, which has no .lower();
        # return an empty document instead of aborting the whole .apply().
        return ""

    words = tokenizer.tokenize(document.lower())

    # Hash-based membership: building the set once per document is far
    # cheaper than scanning the stop-word list once per token.
    stop_set = set(stop_words)
    words = [w for w in words if w not in stop_set]

    # Each pass feeds the previous pass's output, so a token can be reduced
    # by more than one part-of-speech rule.
    for pos in (wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV):
        words = [wordnet_lemmatizer.lemmatize(w, pos) for w in words]
    return " ".join(words)



In [11]:
# Fetch the WordNet corpus; must be available before preprocess() is first called
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mohwa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# Clean every review with preprocess() and store the result next to the raw text
df['processed_text'] = df['text'].apply(preprocess)

In [13]:
# Spot-check that the processed_text column was added
df.head()

Unnamed: 0,polarity,title,text,processed_text
0,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...,read lot review say best game soundtrack figur...
1,2,Amazing!,This soundtrack is my favorite music of all ti...,soundtrack favorite music time hand intense sa...
2,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...,truly like soundtrack enjoy video game music p...
3,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine...",play game know divine music every single song ...
4,2,an absolute masterpiece,I am quite sure any of you actually taking the...,quite sure actually take time read play game l...


In [14]:
# Convenience handle on the cleaned-text column
processed_text = df["processed_text"]

In [15]:
# Hold out part of the data for evaluation (train_test_split's default split size);
# random_state=0 keeps the partition identical across re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    df["processed_text"],
    df["polarity"],
    random_state=0,
)

In [16]:
# Report the size of each split; a blank line separates the train pair from the test pair.
shape_reports = (
    ("x_train shape: {}", x_train, '\n'),
    ("y_train shape: {}", y_train, '\n\n'),
    ("x_test shape: {}", x_test, '\n'),
    ("y_test shape: {}", y_test, '\n\n'),
)
for template, split, terminator in shape_reports:
    print(template.format(split.shape), end=terminator)

x_train shape: (786431,)
y_train shape: (786431,)

x_test shape: (262144,)
y_test shape: (262144,)



In [17]:
# Peek at the training labels (the polarity column of the training split)
y_train

298025    2
742155    1
603940    2
801012    2
112780    1
         ..
963395    2
117952    2
435829    2
305711    2
985772    1
Name: polarity, Length: 786431, dtype: int64

In [None]:
# Fit the TF-IDF vocabulary on the training split only and turn the training
# reviews into a sparse document-term matrix.
tfidf = TfidfVectorizer(min_df=5)  # ignore terms that occur in fewer than 5 training documents
X_train_tfidf = tfidf.fit_transform(x_train)

# `get_feature_names()` was removed in scikit-learn 1.2. Prefer its replacement,
# `get_feature_names_out()`, and fall back only on releases that predate it.
_get_feature_names = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
vocab_terms = np.asarray(_get_feature_names())  # fetched once, reused by both prints
print("Number of features : %d \n" %len(vocab_terms))
print("Show some feature names : \n", vocab_terms[::1000].tolist())

# Decision Tree
# The estimator keeps the name `lr` because the following cells refer to it.
# random_state fixes the tie-breaking between equally good splits, so a
# Restart & Run All reproduces the same tree.
lr = DecisionTreeClassifier(random_state=0)
lr.fit(X_train_tfidf, y_train)

Number of features : 59468 

Show some feature names : 
 ['aa', 'agitation', 'android', 'ascot', 'balaclava', 'beowolf', 'bonfa', 'buggy', 'carthage', 'chirstmas', 'colour', 'convulted', 'cuisnart', 'degredation', 'diligently', 'dotcom', 'effie', 'epson', 'expresion', 'filmographies', 'frederic', 'genus', 'grapelli', 'happly', 'hindustani', 'hyrule', 'inflammation', 'isight', 'juste', 'kroeger', 'lexicon', 'ludicrously', 'marraige', 'merriment', 'molest', 'mz', 'nonpoint', 'onward', 'paly', 'personhood', 'poisonous', 'prise', 'queeg', 'recreate', 'retailer', 'roys', 'schiffman', 'shallot', 'skinner', 'spacy', 'stinkeroo', 'surpises', 'tempature', 'tinkertoy', 'trolley', 'uneeded', 'valancy', 'waaaaaaaaay', 'wil', 'yen']


In [None]:
# A decision tree has no `coef_` (that attribute belongs to linear models), so
# rank the vocabulary by the fitted tree's `feature_importances_` instead.
# `get_feature_names()` was removed in scikit-learn 1.2; fall back to it only
# on releases that predate `get_feature_names_out()`.
_get_feature_names = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
feature_names = np.asarray(_get_feature_names())

importances = lr.feature_importances_
sorted_importance_index = importances.argsort()  # ascending, so the best terms are at the end

# Terms with zero importance were never chosen for a split; listing the
# "smallest" ones would just show an arbitrary handful of them, so report the count.
print('\nNumber of features never used by the tree : {}\n'.format(int((importances == 0).sum())))
print('Top 10 features with the largest importances : \n{}'.format(feature_names[sorted_importance_index[:-11:-1]]))

In [None]:
def modelEvaluation(predictions, y_true=None):
    '''
    Print accuracy, a classification report and the confusion matrix
    for a set of predicted labels.

    Parameters
    ----------
    predictions : array-like
        Predicted labels, aligned element-wise with the true labels.
    y_true : array-like, optional
        Ground-truth labels to score against. When omitted, the
        notebook-level ``y_test`` is used, which is what the original
        one-argument call relied on.

    Returns
    -------
    None
        The metrics are printed, not returned.
    '''
    if y_true is None:
        # Backward-compatible default: score against the global hold-out labels.
        y_true = y_test
    print ("\nAccuracy on validation set: {:.4f}".format(accuracy_score(y_true, predictions)))
    print("\nClassification report : \n", metrics.classification_report(y_true, predictions))
    print("\nConfusion Matrix : \n", metrics.confusion_matrix(y_true, predictions))

In [None]:
# Project the held-out reviews into the TF-IDF space learned on the training split
# (transform only -- the vocabulary is not refitted), then score the tree on them.
X_test_tfidf = tfidf.transform(x_test)
predictions = lr.predict(X_test_tfidf)
modelEvaluation(predictions)

In [None]:

# Pin the label order explicitly so matrix rows/columns and tick labels agree.
# This is the same sorted set of labels confusion_matrix would infer by default.
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(predictions)]))
cfm = confusion_matrix(y_test, predictions, labels=class_labels)

fig, ax = plt.subplots()
im = ax.imshow(cfm, interpolation='nearest')

# A fixed white annotation is unreadable on the bright, high-count cells.
# Colour each number with the opposite end of the colormap from its cell so
# it contrasts whatever colormap is active.
low_end_color, high_end_color = im.cmap(0.0), im.cmap(1.0)
threshold = (cfm.max() + cfm.min()) / 2.0
for i, j in itertools.product(range(cfm.shape[0]), range(cfm.shape[1])):
    ax.text(j, i, cfm[i, j],
            horizontalalignment="center",
            verticalalignment="center",
            color=high_end_color if cfm[i, j] < threshold else low_end_color)

# Show the actual class labels instead of the 0-based cell positions.
tick_positions = range(len(class_labels))
ax.set_xticks(tick_positions)
ax.set_xticklabels(class_labels)
ax.set_yticks(tick_positions)
ax.set_yticklabels(class_labels)

ax.set_ylabel('True label (Recall)')
ax.set_xlabel('Predicted label (Precision)')
ax.set_title('Decision Tree with TFIDF | Confusion Matrix')
fig.colorbar(im, ax=ax);