In [1]:
import pandas as pd

In [2]:
filepath_dict = {'yelp': './yelp_labelled.txt',
                'amazon': './amazon_cells_labelled.txt',
                'imdb': './imdb_labelled.txt'}

In [3]:
df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentences', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)
    print(source)
df = pd.concat(df_list)
df
# df.info()


yelp
amazon
imdb


Unnamed: 0,sentences,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb


In [4]:
from sklearn.model_selection import train_test_split

In [41]:
df_yelp = df[df['source'] == 'yelp']

sentence = df_yelp['sentences'].values
y = df_yelp['label'].values

In [6]:
sentence_train, sentence_test, y_train, y_test = train_test_split(sentence, y, test_size=0.25, random_state=1000)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
vectorizer = CountVectorizer()

In [28]:
vectorizer.fit(sentence_train)

CountVectorizer()

In [39]:
X_train = vectorizer.transform(sentence_train)
X_test = vectorizer.transform(sentence_test)
X_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
classifier = LogisticRegression()

In [13]:
classifier.fit(X_train, y_train)

LogisticRegression()

In [14]:
score = classifier.score(X_test, y_test)

In [15]:
print('Accuracy:', score)

Accuracy: 0.796


In [19]:
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentences'].values
    y = df_source['label'].values
    
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487


In [44]:
my_sentence = ['Great quality product', 'I really love how you can do anything but cook', 'The movie was really bad']
my_label = [1, 0, 0]
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentences'].values
    y = df_source['label'].values
    
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(my_sentence)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, my_label)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.6667
Accuracy for amazon data: 0.6667
Accuracy for imdb data: 1.0000
