This notebook follows the Real Python text classification tutorial: https://realpython.com/python-keras-text-classification/

In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Folder holding one labelled-sentence file per review source.
datafolder = Path("data/sentiment labelled sentences")
paths = {
    'yelp': "yelp_labelled.txt",
    'amazon': "amazon_cells_labelled.txt",
    'imdb': "imdb_labelled.txt",
}

# Each file is tab-separated with no header row: <sentence>\t<label>.
dfs = []
for source, path in paths.items():
    df = pd.read_csv(
        datafolder / path,
        sep='\t',
        header=None,
        names=['sentence', 'label'],
    )
    df['source'] = source  # tag every row with the file it came from
    dfs.append(df)

# Stack the per-source frames into one. Every frame keeps its own index
# labels, so labels repeat across sources in the combined frame; use
# pd.concat(dfs, ignore_index=True) if a unique row index is needed.
df_all = pd.concat(dfs)

In [3]:
# Peek at the rows by position around the point where one source's rows end
# and the next source's rows begin.
df_all.iloc[999:1010]

Unnamed: 0,sentence,label,source
999,"Then, as if I hadn't wasted enough of my life ...",0,yelp
0,So there is no way for me to plug it in here i...,0,amazon
1,"Good case, Excellent value.",1,amazon
2,Great for the jawbone.,1,amazon
3,Tied to charger for conversations lasting more...,0,amazon
4,The mic is great.,1,amazon
5,I have to jiggle the plug to get it to line up...,0,amazon
6,If you have several dozen or several hundred c...,0,amazon
7,If you are Razr owner...you must have this!,1,amazon
8,"Needless to say, I wasted my money.",0,amazon


In [11]:
from sklearn.model_selection import train_test_split

# Work with the yelp rows only.
df_yelp = df_all.loc[df_all['source'] == 'yelp']

# Plain arrays of raw sentences and their labels.
sentences = df_yelp['sentence'].values
y = df_yelp['label'].values

# Hold out 30% of the sentences for testing; the fixed random_state keeps
# the split the same on every run.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.3,
    random_state=13,
)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Turn sentences into token-count vectors.
# The vectorizer is fitted on the training sentences only, then the same
# fitted vectorizer is applied to the test sentences.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(sentences_train)
X_test = vectorizer.transform(sentences_test)

In [13]:
# Display the training feature matrix summary (shape, dtype, stored elements).
X_train

<700x1616 sparse matrix of type '<class 'numpy.int64'>'
	with 6783 stored elements in Compressed Sparse Row format>

In [14]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: logistic regression with default settings, trained on
# the count vectors. The fit call is the cell's last expression, so the
# fitted estimator's repr is shown as the cell output.
clf = LogisticRegression()
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
# Score the fitted classifier on the held-out test split.
clf.score(X_test, y_test)

0.8066666666666666

In [19]:
# Repeat the count-vector + logistic-regression baseline for every source.
# NOTE: the loop reassigns the notebook-level names (sentences, y, X_train,
# X_test, vectorizer, clf, ...); once it finishes they hold the values from
# the last source processed.
for source in df_all['source'].unique():
    df_source = df_all.loc[df_all['source'] == source]

    sentences = df_source['sentence'].values
    y = df_source['label'].values

    # 70/30 split with a fixed seed so the reported accuracies are repeatable.
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences,
        y,
        test_size=0.3,
        random_state=13,
    )

    # Fit the vectorizer on the training sentences only, then reuse it for
    # the test sentences.
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(sentences_train)
    X_test = vectorizer.transform(sentences_test)

    # fit() returns the estimator itself, so construction and training chain.
    clf = LogisticRegression().fit(X_train, y_train)

    print(f'Accuracy for {source} data: {clf.score(X_test, y_test):.4f}')

Accuracy for yelp data: 0.8067
Accuracy for amazon data: 0.7867
Accuracy for imdb data: 0.7378
