<h1>NLP</h1>

In [106]:
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import re
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
# Sample sentence for comparing tokenizers: it has a leading space, a token that
# starts with a digit ("1st"), a possessive, a contraction and punctuation.
text = " 1st This is Nitin's text, isn't it?"

In [6]:
# Whitespace tokenizer: splits on whitespace only, so punctuation stays glued
# to the neighbouring word (e.g. 'text,' and 'it?' in the output).
tokenizer = nltk.tokenize.WhitespaceTokenizer()
tokenizer.tokenize(text)

['1st', 'This', 'is', "Nitin's", 'text,', "isn't", 'it?']

In [7]:
# Treebank tokenizer: separates punctuation and splits clitics off their host
# word ("Nitin's" -> 'Nitin', "'s"; "isn't" -> 'is', "n't"), as the output shows.
# Rebinds `tokenizer`, replacing the whitespace tokenizer.
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokenizer.tokenize(text)

['1st', 'This', 'is', 'Nitin', "'s", 'text', ',', 'is', "n't", 'it', '?']

In [8]:
# WordPunct tokenizer: every apostrophe and punctuation mark becomes its own
# token, so "isn't" ends up as 'isn', "'", 't' in the output.
# Rebinds `tokenizer` once more.
tokenizer = nltk.tokenize.WordPunctTokenizer()
tokenizer.tokenize(text)

['1st',
 'This',
 'is',
 'Nitin',
 "'",
 's',
 'text',
 ',',
 'isn',
 "'",
 't',
 'it',
 '?']

In [89]:
# New sample for lemmatization (irregular plural, regular plurals, past tense).
# Overwrites the tokenizer-comparison sentence bound to `text` earlier.
text = "feet cats wolves talked"

In [90]:
# Tokenize the sample with the Treebank tokenizer; `tokens` feeds the
# lemmatization step that follows.
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokens = tokenizer.tokenize(text)

In [91]:
# NOTE: despite its name, `stemmer` holds a WordNet *lemmatizer*, not a stemmer.
# lemmatize() is called with its default part of speech; 'talked' comes back
# unchanged in the output (presumably because the default POS is noun -- see
# the NLTK docs).
stemmer = nltk.stem.WordNetLemmatizer()
" ".join(map(stemmer.lemmatize, tokens))

'foot cat wolf talked'

<h2>TF and IDF</h2> <br>
$tf(t,d)=f_{t,d}$ <br>
$idf(t,D)=\log\left(\frac{N}{|\{d \in D:t \in d\}|}\right)$ <br>
$tfidf(t,d,D) = tf(t,d) \cdot idf(t,D)$

A high weight in TF-IDF is reached by a high term frequency (in the given document) and a low document frequency of the term in the whole collection of documents.

In [92]:
# Hand-made corpus statistics for a worked TF-IDF example.
N = 4  # total number of documents in the toy corpus
term1_n = 3  # number of documents containing term 1 (d1, d2, d3 below)
term2_n = 1  # number of documents containing term 2 (only d1 below)
# Raw counts of term 1 in documents 1..4.
term1_d1_f = 1
term1_d2_f = 10
term1_d3_f = 50
term1_d4_f = 0
# Raw counts of term 2 in documents 1..4.
term2_d1_f = 8
term2_d2_f = 0
term2_d3_f = 0
term2_d4_f = 0

In [93]:
# tf-idf = tf * idf. The idf factor depends only on the term, so it is computed
# once per term and reused for each document instead of being re-derived on
# every line. The printed text is identical to the original eight print calls.
term1_idf = np.log(N / term1_n)
for doc_no, tf in enumerate((term1_d1_f, term1_d2_f, term1_d3_f, term1_d4_f), start=1):
    print("Term 1 in document %d: " % doc_no, term1_idf * tf)
print("-------------------------------------------------------")
term2_idf = np.log(N / term2_n)
for doc_no, tf in enumerate((term2_d1_f, term2_d2_f, term2_d3_f, term2_d4_f), start=1):
    print("Term 2 in document %d: " % doc_no, term2_idf * tf)

Term 1 in document 1:  0.287682072452
Term 1 in document 2:  2.87682072452
Term 1 in document 3:  14.3841036226
Term 1 in document 4:  0.0
-------------------------------------------------------
Term 2 in document 1:  11.090354889
Term 2 in document 2:  0.0
Term 2 in document 3:  0.0
Term 2 in document 4:  0.0


In [16]:
def preProcess(words):
    """Normalize raw text before it is tokenized by the vectorizer.

    Every run of word characters that contains at least one digit (e.g. "1st",
    "abc123") is deleted, surrounding whitespace is trimmed, and the result is
    lowercased. Whitespace left *inside* the string by a deleted token is not
    collapsed.

    Lowercasing is done here because a custom ``preprocessor`` passed to
    scikit-learn's vectorizers replaces their built-in lowercasing stage.

    Args:
        words: the raw document string.

    Returns:
        The cleaned, lowercased string.
    """
    digit_free = re.sub(r'\w*\d\w*', '', words)
    trimmed = digit_free.strip()
    return trimmed.lower()

In [17]:
# Five tiny "reviews" used to show which n-grams survive the df filters.
texts = ["good movie", "not a Good movie", "did not like 1st", "i like it", "good one 1st"]

# Keep unigrams and bigrams that occur in at least 2 documents (min_df=2) and in
# at most half of them (max_df=0.5); text is cleaned by preProcess first.
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2), preprocessor=preProcess)
features = tfidf.fit_transform(texts)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
list_feature_names = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names

# toarray() yields a plain ndarray (todense() returns the discouraged np.matrix).
pd.DataFrame(features.toarray(), columns=list_feature_names())

Unnamed: 0,good movie,like,movie,not
0,0.707107,0.0,0.707107,0.0
1,0.57735,0.0,0.57735,0.57735
2,0.0,0.707107,0.0,0.707107
3,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0


In [95]:
# Project unseen documents onto the vocabulary learned from `texts`; nothing is
# re-fitted here, so words outside that vocabulary (e.g. "Nitin") are ignored.
a = tfidf.transform(["it not Nitin movie not", "Nitin", "not a good movie", "like it", "not not not"])

In [96]:
# One row per input document, one column per vocabulary entry. In the output
# below the non-zero rows have unit length, and the row for "Nitin" is all zeros.
print(a.todense())

[[ 0.          0.          0.4472136   0.89442719]
 [ 0.          0.          0.          0.        ]
 [ 0.57735027  0.          0.57735027  0.57735027]
 [ 0.          1.          0.          0.        ]
 [ 0.          0.          0.          1.        ]]


In [12]:
# Load the review dataset from the working directory. The cells below read its
# 'review' (text) and 'sentiment' (label) columns.
df = pd.read_csv('movie_data.csv')

In [13]:
# Positional hold-out split: the first 25,000 rows train the model and the
# remainder evaluates it.
# NOTE(review): no shuffling happens here -- this assumes the CSV rows are
# already in random order; confirm against how movie_data.csv was produced.
train = df.iloc[:25000]
test = df.iloc[25000:]

In [45]:
# Separate inputs from labels. The double brackets keep X_* as single-column
# DataFrames; the 'review' column is pulled back out when vectorizing.
X_train = train[['review']]
X_test = test[['review']]
y_train = train['sentiment']
y_test = test['sentiment']

In [46]:
# Vectorizer for the real data: unigrams + bigrams seen in at least 5 training
# reviews. This rebinds `tfidf`, so the toy vectorizer fitted earlier is no
# longer reachable after this cell runs.
tfidf = TfidfVectorizer(min_df=5, ngram_range=(1, 2), preprocessor=preProcess) 
# Fit the vocabulary/idf on the training reviews only, then reuse it for test.
train_features = tfidf.fit_transform(X_train['review'])
test_features = tfidf.transform(X_test['review'])

In [49]:
# Logistic-regression classifier with every hyperparameter left at the
# library default.
clf = LogisticRegression()

In [None]:
# Train on the TF-IDF features of the training reviews.
# NOTE(review): this cell has no execution count in the saved notebook --
# re-run it before relying on the outputs of later cells.
clf.fit(train_features, y_train)

In [51]:
# Predicted labels for the held-out reviews.
y_predict = clf.predict(test_features)

In [101]:
# Hold-out accuracy as a percentage, rounded to one decimal place.
print("Accuracy: ", round(accuracy_score(y_test, y_predict)*100, 1))

Accuracy:  89.7


In [98]:
# Rank features by learned weight (ascending) once, then slice both ends:
# the last ten indices carry the largest coefficients, the first ten the
# smallest. Both lists stay in ascending-weight order.
coef_order = np.argsort(clf.coef_[0])
ind_max = list(coef_order[-10:])
ind_min = list(coef_order[:10])

In [103]:
# Column index -> n-gram lookup for the fitted vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# list(...) keeps the original return type (a plain list of str).
feature_names = list(
    (getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names)()
)

In [105]:
# Translate the selected coefficient indices back into their n-grams.
top_10 = [feature_names[idx] for idx in ind_max]
bottom_10 = [feature_names[idx] for idx in ind_min]

# Each print emits the label and the list on separate lines.
print('Top 10 positive words based on learned weights:', top_10, sep='\n')
print('Top 10 negative words based on learned weights:', bottom_10, sep='\n')

Top 10 positive words based on learned weights:
['brilliant', 'love', 'best', 'the best', 'amazing', 'wonderful', 'perfect', 'and', 'excellent', 'great']
Top 10 negative words based on learned weights:
['bad', 'worst', 'the worst', 'awful', 'boring', 'no', 'terrible', 'poor', 'waste', 'nothing']
