Choosing a Data Set

In [1]:
import pandas as pd

Load the data using pandas library

In [2]:
yelp_df = pd.read_csv('data/yelp_labelled.txt',names=['sentence', 'label'],sep='\t')
yelp_df

Unnamed: 0,sentence,label
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


Split data in train and test sets

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
sentences = yelp_df['sentence'].values
y = yelp_df['label'].values
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences,y,test_size=0.25,random_state=1000)
len(sentences_train), len(sentences_test)

(750, 250)

Convert data and change it from text to binary vector

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
vectorizer = CountVectorizer()

In [7]:
vectorizer.fit(sentences_train)
X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)
X_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

In [8]:
X_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

Fit the model using training data set and the use the test data set to claculate the accurecy for the fitting model.

In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
classifier = LogisticRegression() 

In [11]:
classifier.fit(X_train,y_train)
classifier.score(X_test,y_test)

0.796

In [12]:
filepath_dict = {'yelp':   'data/yelp_labelled.txt',
                 'amazon': 'data/amazon_cells_labelled.txt',
                 'imdb':   'data/imdb_labelled.txt'}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source  # Add another column filled with the source name
    df_list.append(df)

df = pd.concat(df_list)
print(df.iloc[0])

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


In [13]:
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487


Test the models with my own sentences I will test Imdb model

In [14]:
my_sentences = ['This movie was nice', 'This movie was terrifying']
my_sentences_trans = vectorizer.transform(my_sentences)
classifier.predict(my_sentences_trans)

array([1, 0])