<a href="https://colab.research.google.com/github/plaban1981/NLP-with-Python/blob/master/Text_Classification_Using_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Movie Review Classification

In [0]:
import pandas as pd
import numpy as np

## Read Movie Reviews text file

In [2]:
# Load the tab-separated review file and preview its first five rows.
df = pd.read_csv('/content/moviereviews.tsv', delimiter='\t')
df.head(n=5)

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [3]:
# Dimensions of the freshly loaded frame as a (rows, columns) tuple.
df.shape

(2000, 2)

In [5]:
# Print the full text of the review stored under index label 0.
print(df.loc[0, 'review'])

how do films like mouse hunt get into theatres ? 
isn't there a law or something ? 
this diabolical load of claptrap from steven speilberg's dreamworks studio is hollywood family fare at its deadly worst . 
mouse hunt takes the bare threads of a plot and tries to prop it up with overacting and flat-out stupid slapstick that makes comedies like jingle all the way look decent by comparison . 
writer adam rifkin and director gore verbinski are the names chiefly responsible for this swill . 
the plot , for what its worth , concerns two brothers ( nathan lane and an appalling lee evens ) who inherit a poorly run string factory and a seemingly worthless house from their eccentric father . 
deciding to check out the long-abandoned house , they soon learn that it's worth a fortune and set about selling it in auction to the highest bidder . 
but battling them at every turn is a very smart mouse , happy with his run-down little abode and wanting it to stay that way . 
the story alternate

## Check Null Values

In [6]:
# Count NaNs per column and keep only the columns that contain any.
df.isnull().sum().loc[lambda counts: counts > 0]

review    35
dtype: int64

## Drop NaN values

In [0]:
# Discard every row that has a NaN in any column.
df = df.dropna()

## Check that no null values remain

In [8]:
# Re-run the NaN count; an empty Series means no column has NaNs left.
df.isnull().sum().loc[lambda counts: counts > 0]

Series([], dtype: int64)

## Remove empty reviews (whitespace-only strings)

In [0]:
# Collect the index labels of reviews that are empty or contain only whitespace.
# The vectorised .str test replaces the row-by-row loop: it does not depend on
# the frame having exactly two columns (the old 3-tuple unpack did), and
# non-string values become NaN, which compares unequal to '' instead of raising.
blanks = df.index[df['review'].str.strip().eq('')].tolist()

## The following index positions have blank reviews

In [11]:
# Display the index labels gathered for the blank reviews.
blanks

[57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

In [0]:
# Remove the rows whose index labels were flagged as blank reviews.
df = df.drop(index=blanks)

In [13]:
# Dimensions after dropping NaN and blank reviews, as (rows, columns).
df.shape

(1938, 2)

## Split the data into training set and test set

In [0]:
# Features are the review texts; targets are their labels.
X, Y = df['review'], df['label']

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.15, random_state=1
)
# One shape per line: training split first, then test split.
print(X_train.shape, X_test.shape, sep='\n')

(1647,)
(291,)


## Create Pipeline to vectorize the data and train and fit the model

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# TF-IDF features feed a linear SVM classifier.
# LinearSVC's solver uses a random number generator while fitting, so it is
# seeded here; otherwise the reported metrics can differ slightly between runs.
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=1)),
])

## Train the pipeline by calling its fit method

In [20]:
# Fit the vectorizer and the classifier on the training split in one call.
text_pipeline.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

## Predictions

In [0]:
# Predict a label for every held-out review.
y_pred = text_pipeline.predict(X=X_test)

In [22]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         neg       0.84      0.86      0.85       149
         pos       0.85      0.83      0.84       142

    accuracy                           0.85       291
   macro avg       0.85      0.85      0.85       291
weighted avg       0.85      0.85      0.85       291



In [23]:
# Rows are the true labels, columns the predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[128  21]
 [ 24 118]]


## Accuracy Score

In [24]:
# Fraction of test reviews whose predicted label matches the true label.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.845360824742268


## Classifying unseen movie reviews as pos (positive) or neg (negative)

In [29]:
# Classify one hand-written review; it is wrapped in a list because predict
# takes a collection of documents.
text_pipeline.predict(['The movie was biased in the presentation.\n Did not like the ending.'])

array(['neg'], dtype=object)

In [30]:
# Classify a second hand-written review, again passed as a one-element list.
text_pipeline.predict(['Frozen is a sweet adorable representation of the bond between two sisters. The movie is amazing'])

array(['pos'], dtype=object)

## Text Classification Assessment - moviereviews2.tsv

#### Task #1: Perform imports and load the dataset into a pandas DataFrame

In [31]:
# Load the second tab-separated review file and preview its first five rows.
df1 = pd.read_csv('/content/moviereviews2.tsv', delimiter='\t')
df1.head(n=5)

Unnamed: 0,label,review
0,pos,I loved this movie and will watch it again. Or...
1,pos,"A warm, touching movie that has a fantasy-like..."
2,pos,I was not expecting the powerful filmmaking ex...
3,neg,"This so-called ""documentary"" tries to tell tha..."
4,pos,This show has been my escape from reality for ...


## Task #2: Check for missing values

In [34]:
# Count NaNs per column and keep only the columns that contain any.
df1.isnull().sum().loc[lambda counts: counts > 0]

review    20
dtype: int64

## Drop missing values

In [0]:
# Discard every row that has a NaN in any column.
df1 = df1.dropna()

## Check that the NaN values have been eliminated

In [36]:
# Re-run the NaN count; an empty Series means no column has NaNs left.
df1.isnull().sum().loc[lambda counts: counts > 0]

Series([], dtype: int64)

#### Check for whitespace-only reviews

In [0]:
# Collect the index labels of reviews that are empty or contain only whitespace.
# The vectorised .str test replaces the row-by-row loop: it does not depend on
# the frame having exactly two columns (the old 3-tuple unpack did), and
# non-string values become NaN, which compares unequal to '' and is skipped,
# just as the explicit type check skipped them before.
blanks = df1.index[df1['review'].str.strip().eq('')].tolist()

In [41]:
# Number of blank reviews found in df1.
print(len(blanks))

0


#### This means every remaining review is populated; none is whitespace-only

## Task #4: Take a quick look at the label column

In [43]:
# Count the reviews per label to see how balanced the classes are.
df1['label'].value_counts()

neg    2990
pos    2990
Name: label, dtype: int64

## Task #5: Split the data into train and test sets

In [0]:
from sklearn.model_selection import train_test_split

# Features are the review texts; targets are their labels.
X, Y = df1['review'], df1['label']
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)

In [72]:
# One shape per line: training split first, then test split.
print(X_train.shape, X_test.shape, sep='\n')

(4006,)
(1974,)


In [80]:
# Peek at the first training label (by position, not by index label).
y_train.iloc[0]

'neg'

In [79]:
# Peek at the first test label (by position, not by index label).
y_test.iloc[0]

'neg'

## Task #6 - Build a pipeline to vectorize the review and train and fit a model

In [81]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# TF-IDF features feed a linear SVM classifier.
# LinearSVC's solver uses a random number generator while fitting, so it is
# seeded here; otherwise the reported metrics can differ slightly between runs.
text_pipeline_1 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=42)),
])
# Fit the vectorizer and the classifier on the training split; the fitted
# pipeline is the cell's displayed value.
text_pipeline_1.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

## Task #7 : Run Predictions and Analyze the Results

In [0]:
# Predict a label for every held-out review.
predictions = text_pipeline_1.predict(X=X_test)

In [84]:
# Peek at the first predicted label.
predictions[0]

'neg'

#### Report Confusion Matrix

In [86]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted labels.
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[900,  91],
       [ 63, 920]])

## Classification Report

In [88]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         neg       0.93      0.91      0.92       991
         pos       0.91      0.94      0.92       983

    accuracy                           0.92      1974
   macro avg       0.92      0.92      0.92      1974
weighted avg       0.92      0.92      0.92      1974



## Accuracy Score

In [89]:
from sklearn.metrics import accuracy_score

# Fraction of test reviews whose predicted label matches the true label.
print(accuracy_score(y_true=y_test, y_pred=predictions))

0.9219858156028369
