# Text Classifier

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
filepath_dict = {'yelp' : 'data/yelp_labelled.txt', 'imdb' : 'data/imdb_labelled.txt', 'amazon' : 'data/amazon_cells_labelled.txt' }

In [3]:
filepath_dict['yelp']
'data/yelp_labeled.txt'

'data/yelp_labeled.txt'

In [16]:
df_list = []
for source , filepath in filepath_dict.items() :
    df = pd.read_csv(filepath, names = ['sentence', 'label'], sep = '\t' )
    df['source'] = source
    df_list.append(df)
df_list

[                                              sentence  label source
 0                             Wow... Loved this place.      1   yelp
 1                                   Crust is not good.      0   yelp
 2            Not tasty and the texture was just nasty.      0   yelp
 3    Stopped by during the late May bank holiday of...      1   yelp
 4    The selection on the menu was great and so wer...      1   yelp
 ..                                                 ...    ...    ...
 995  I think food should have flavor and texture an...      0   yelp
 996                           Appetite instantly gone.      0   yelp
 997  Overall I was not impressed and would not go b...      0   yelp
 998  The whole experience was underwhelming, and I ...      0   yelp
 999  Then, as if I hadn't wasted enough of my life ...      0   yelp
 
 [1000 rows x 3 columns],
                                               sentence  label source
 0    A very, very, very slow-moving, aimless movie ...      0

In [17]:
df = pd.concat(df_list)
df

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
995,The screen does get smudged easily because it ...,0,amazon
996,What a piece of junk.. I lose more calls on th...,0,amazon
997,Item Does Not Match Picture.,0,amazon
998,The only thing that disappoint me is the infra...,0,amazon


In [18]:
df_yelp = df [ df ['source'] == 'yelp']
df_yelp.head()

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


In [19]:
X = df_yelp['sentence'].values
Y = df_yelp['label'].values
X[:5]

array(['Wow... Loved this place.', 'Crust is not good.',
       'Not tasty and the texture was just nasty.',
       'Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.',
       'The selection on the menu was great and so were the prices.'],
      dtype=object)

In [20]:
Y[:5]

array([1, 0, 0, 1, 1])

In [21]:
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.25, random_state=100)
x_train[:10]                                                   

array(['We will not be coming back.',
       'We waited for forty five minutes in vain.',
       "I could barely stomach the meal, but didn't complain because it was a business lunch.",
       "I'd rather eat airline food, seriously.",
       "Needless to say, I won't be going back anytime soon.",
       'For that price I can think of a few place I would have much rather gone.',
       'To my disbelief, each dish qualified as the worst version of these foods I have ever tasted.',
       "I promise they won't disappoint.",
       'The decor is nice, and the piano music soundtrack is pleasant.',
       'Will go back next trip out.'], dtype=object)

## Shape The Data In An Acceptable Shape By Regression Model

In [9]:
vectorizer = CountVectorizer()

In [10]:
vectorizer.fit(x_train)

CountVectorizer()

In [11]:
trans_x_train = vectorizer.transform(x_train)
trans_x_test = vectorizer.transform(x_test)
trans_x_train[:10]

<10x1724 sparse matrix of type '<class 'numpy.int64'>'
	with 89 stored elements in Compressed Sparse Row format>

## Train/Classify The Data Using Logistic Regression

In [12]:
classifier = LogisticRegression()
classifier.fit(trans_x_train, y_train)
score = classifier.score(trans_x_test, y_test)
score

0.808

## Apply 

In [13]:
from sklearn.model_selection import train_test_split
df_yelp = df[df['source'] == 'yelp']
sentences = df_yelp['sentence'].values
y = df_yelp['label'].values
sentences_train , sentences_test , y_train , y_test = train_test_split(sentences, y, test_size = 0.25, random_state = 1000)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)
X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)
X_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

In [15]:
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for imdb data: 0.7487
Accuracy for amazon data: 0.7960
