## import pandas library and sklearn

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Put files path

In [2]:
filepath_dict = {'yelp':   'yelp_labelled.txt',
                 'amazon': 'amazon_cells_labelled.txt',
                 'imdb':   'imdb_labelled.txt'}

## Read the data

In [3]:
df_list = [] 

for source, path in filepath_dict.items():
    df = pd.read_csv(path, names=['sentence', 'label'], sep='\t')
    df['source'] = source 
    df_list.append(df)

df = pd.concat(df_list)
print(df.iloc[0])

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


## Declare vectorizer

In [4]:
sentences = ['John likes ice cream', 'John hates chocolate.']

In [5]:
# from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer

CountVectorizer(lowercase=False, min_df=0)

## Fitting the data 

In [6]:
vectorizer.fit(sentences)
vectorizer.vocabulary_

{'John': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}

## Defining Base model for "Yelp"

In [7]:
# from sklearn.model_selection import train_test_split

df_yelp = df[df['source'] == 'yelp']
df_yelp

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
995,I think food should have flavor and texture an...,0,yelp
996,Appetite instantly gone.,0,yelp
997,Overall I was not impressed and would not go b...,0,yelp
998,"The whole experience was underwhelming, and I ...",0,yelp


In [8]:
sentences = df_yelp['sentence'].values

In [9]:
y = df_yelp['label'].values

In [10]:
x_train , x_test , y_train , y_test = train_test_split(sentences, y, test_size=0.25, random_state=100)

In [11]:
vectorizer = CountVectorizer()
vectorizer

vectorizer.fit(x_train)
vectorizer

CountVectorizer()

In [12]:
X_train = vectorizer.transform(x_train)
X_train

<750x1724 sparse matrix of type '<class 'numpy.int64'>'
	with 7422 stored elements in Compressed Sparse Row format>

In [13]:
X_test  = vectorizer.transform(x_test)
X_test

<250x1724 sparse matrix of type '<class 'numpy.int64'>'
	with 2026 stored elements in Compressed Sparse Row format>

In [14]:
# from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
score

print("Accuracy = ", score)

Accuracy =  0.808


## Defining Base model for "Amazon"

In [26]:
df_amazon = df[df['source'] == 'amazon']
df_amazon

Unnamed: 0,sentence,label,source
0,So there is no way for me to plug it in here i...,0,amazon
1,"Good case, Excellent value.",1,amazon
2,Great for the jawbone.,1,amazon
3,Tied to charger for conversations lasting more...,0,amazon
4,The mic is great.,1,amazon
...,...,...,...
995,The screen does get smudged easily because it ...,0,amazon
996,What a piece of junk.. I lose more calls on th...,0,amazon
997,Item Does Not Match Picture.,0,amazon
998,The only thing that disappoint me is the infra...,0,amazon


In [15]:

df_amazon = df[df['source'] == 'amazon']
sentences = df_amazon['sentence'].values
y = df_amazon['label'].values

x_train , x_test , y_train , y_test = train_test_split(sentences, y, test_size=0.25, random_state=100)

In [16]:
vectorizer = CountVectorizer()
vectorizer

vectorizer.fit(x_train)
vectorizer

CountVectorizer()

In [17]:
X_train = vectorizer.transform(x_train)
X_train

<750x1551 sparse matrix of type '<class 'numpy.int64'>'
	with 6870 stored elements in Compressed Sparse Row format>

In [18]:
X_test  = vectorizer.transform(x_test)
X_test

<250x1551 sparse matrix of type '<class 'numpy.int64'>'
	with 1945 stored elements in Compressed Sparse Row format>

In [19]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
score

print("Accuracy = ", score)

Accuracy =  0.82


## Defining Base model for "Imdb"

In [None]:
df_imdb = df[df['source'] == 'imdb']
df_imdb

In [20]:
df_imdb = df[df['source'] == 'imdb']
sentences = df_imdb['sentence'].values
y = df_imdb['label'].values

x_train , x_test , y_train , y_test = train_test_split(sentences, y, test_size=0.25, random_state=100)

In [21]:
vectorizer = CountVectorizer()
vectorizer

vectorizer.fit(x_train)
vectorizer

CountVectorizer()

In [22]:
X_train = vectorizer.transform(x_train)
X_train

<561x2517 sparse matrix of type '<class 'numpy.int64'>'
	with 8495 stored elements in Compressed Sparse Row format>

In [23]:
X_test  = vectorizer.transform(x_test)
X_test

<187x2517 sparse matrix of type '<class 'numpy.int64'>'
	with 2310 stored elements in Compressed Sparse Row format>

In [24]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
score

print("Accuracy = ", score)

Accuracy =  0.7379679144385026


In [25]:
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487
