 <h2 align="center">Logistic Regression: A Sentiment Analysis Case Study</h2>

### Loading the dataset
---

In [1]:
#importing the pandas library
import pandas as pd
#loading the dataset
df = pd.read_csv('data/movie_data.csv')
df.head()


Unnamed: 0,review,sentiment
0,This movie is just crap. Even though the direc...,0
1,Another detailed work on the subject by Dr Dwi...,1
2,THE CAT O'NINE TAILS (Il Gatto a Nove Code) <b...,0
3,"Like with any movie genre, there are good gang...",0
4,I watched it with my mom and we were like...<b...,0


###  Transforming documents into feature vectors

In [2]:
#importing libraries
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
#creating a CountVectorizer
count = CountVectorizer()
docs = np.array(['The sun is shining',
                'The weather is sweet',
                'The sun is shining, the weather is sweet, and one and one is two'])
bag = count.fit_transform(docs)

In [3]:
#printing the vocabulary
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [4]:
#printing the feature vector
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


Raw term frequencies: *tf (t,d)*—the number of times a term t occurs in a document *d*

### Word relevancy using term frequency-inverse document frequency

In [5]:
from sklearn.feature_extraction.text  import TfidfTransformer
#setting numpy presion to 2 decimal places
np.set_printoptions(precision=2)
#creating a TfidTransformation
Tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
print(Tfidf.fit_transform(bag).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


###  Data Preparation

In [6]:
import re
#definnig a function to strip all the html tags from the text and 
#moving all the emoticons to the end of the text 
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text

In [7]:
df['review'] = df['review'].apply(preprocessor)

###  Tokenization of documents

In [8]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

In [9]:
def tokenizer (text):
    return text.split()

In [10]:
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [11]:
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/anmol/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

###  Transform Text Data into TF-IDF Vectors

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

Tfidvector = TfidfVectorizer(strip_accents=None,
                            lowercase=False,
                            preprocessor = None,
                            tokenizer = tokenizer_porter,
                            use_idf =True,
                            norm = 'l2',
                            smooth_idf= True)
y = df.sentiment.values
x = Tfidvector.fit_transform(df.review)

###  Document Classification using Logistic Regression

In [13]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.5, random_state = 1, shuffle = False)

In [14]:
import pickle
from sklearn.linear_model import LogisticRegressionCV
#creating the regresion model for the prediction
clf = LogisticRegressionCV(cv =5,scoring='accuracy',random_state = 0, n_jobs =-1 , verbose =3,
                           max_iter= 300).fit(x_test,y_test)
#saving the created model for future use using the pickel library
saved_model = open('trained_model.sav','wb')
pickle.dump(clf,saved_model)
saved_model.close()


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   59.2s remaining:  1.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.0min finished


###  Model Evaluation

In [15]:
#loading the model back from the saved file
file = 'trained_model.sav'
new_clf = pickle.load(open(file,'rb'))

In [16]:
#testing the accuracy of the model
clf.score(x_test, y_test)

0.9520912547528517