In [1]:
import pandas as pd  
import numpy as np

df_1 = pd.read_csv('clean_reviews.csv')
df_2 = pd.read_csv('clean_reviews_2.csv')
df = pd.concat([df_1, df_2], axis=0)

df.reset_index(inplace=True)
del df['index']
del df['Unnamed: 0']

df['target'] = df['target'].astype(str)
df = df[df['target'].apply(lambda x: len(x) == 1)]
df.dropna(inplace=True)
df.reset_index(drop=True,inplace=True)

df['target'] = df['target'].astype(int)

df = df.sort_values(by=['target'])
df.reset_index(inplace=True)
del df['index']

df.to_csv('data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74920 entries, 0 to 74919
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    74920 non-null  object
 1   target  74920 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


# Dataset felosztása

In [2]:
from sklearn.model_selection import train_test_split

x = df.text
y = df.target

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2)

In [3]:
print ("Train set: \n Total: {0} \n negative: {1:.2f}% \n positive: {2:.2f}%".format(len(x_train),
                                                                             (len(x_train[y_train == 0]) / (len(x_train)*1.))*100,
                                                                            (len(x_train[y_train == 1]) / (len(x_train)*1.))*100))
print("\n")

print ("Test set: \n Total: {0} \n negative: {1:.2f}% \n positive: {2:.2f}% ".format(len(x_test),
                                                                             (len(x_test[y_test == 0]) / (len(x_test)*1.))*100,
                                                                            (len(x_test[y_test == 1]) / (len(x_test)*1.))*100))

Train set: 
 Total: 59936 
 negative: 50.05% 
 positive: 49.95%


Test set: 
 Total: 14984 
 negative: 49.79% 
 positive: 50.21% 


In [4]:
train_df = pd.concat([x_train, y_train], axis=1).sort_values('target').reset_index()
del train_df['index']

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59936 entries, 0 to 59935
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    59936 non-null  object
 1   target  59936 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 936.6+ KB


# Szótár alapú megközelítés

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cvec = CountVectorizer(max_features=45000, stop_words='english')
cvec.fit(x_train)

CountVectorizer(max_features=45000, stop_words='english')

In [7]:
len(cvec.get_feature_names())

45000

In [8]:
## Szavak -> mátrix

neg_mat = cvec.transform(train_df[train_df.target == 0].text)
pos_mat = cvec.transform(train_df[train_df.target == 1].text)

neg_term_freq = np.sum(neg_mat,axis=0)
pos_term_freq = np.sum(pos_mat,axis=0)

neg = np.squeeze(np.asarray(neg_term_freq))
pos = np.squeeze(np.asarray(pos_term_freq))

term_freq = pd.DataFrame([neg,pos],columns=cvec.get_feature_names()).transpose()

corp_mat = cvec.transform(train_df.text)

In [9]:
train_df[train_df.target == 0].tail()

Unnamed: 0,text,target
29994,in the first one it was mainly giant rats but ...,0
29995,poor second string feature from universal pict...,0
29996,mccabe and mrs miller takes place in the turn ...,0
29997,riggs and murtough are back but the magic of t...,0
29998,this is blatantly futuristic adaptation of jul...,0


In [10]:
## negativ szavak elődordulása

neg_batches = np.linspace(0,29929,10).astype(int)

i=0

neg_tf = []

while i < len(neg_batches)-1:
    batch_result = np.sum(corp_mat[neg_batches[i]:neg_batches[i+1]].toarray(),axis=0)
    neg_tf.append(batch_result)
    i += 1

In [11]:
train_df[train_df.target == 1].tail()

Unnamed: 0,text,target
59931,generally love these mystery police charlie ch...,1
59932,film that deserved theatrical release this mad...,1
59933,as long as you go into this movie knowing that...,1
59934,was very impressed with this small independent...,1
59935,ever sense was kid have loved this movie have ...,1


In [12]:
## pozitiv szavak elődordulása

pos_batches = np.linspace(29975,59936,10).astype(int)

i=0

pos_tf = []

while i < len(pos_batches)-1:
    batch_result = np.sum(corp_mat[pos_batches[i]:pos_batches[i+1]].toarray(),axis=0)
    pos_tf.append(batch_result)
    i += 1

In [13]:
## előfordulások összegzése

neg = np.sum(neg_tf,axis=0)
pos = np.sum(pos_tf,axis=0)

## előfordulási dataframe

term_freq_df = pd.DataFrame([neg,pos],columns=cvec.get_feature_names()).transpose()
term_freq_df.head()

Unnamed: 0,0,1
aa,13,8
aaa,8,7
aaargh,5,2
aag,37,2
aage,0,5


In [14]:
term_freq_df.columns = ['negative', 'positive']
term_freq_df['total'] = term_freq_df['negative'] + term_freq_df['positive']
term_freq_df.sort_values(by='total', ascending=False).iloc[:10]

Unnamed: 0,negative,positive,total
movie,59635,45807,105442
film,45295,49794,95089
like,26863,21309,48172
just,25303,17001,42304
good,17738,18298,36036
time,14800,15444,30244
really,14935,13037,27972
story,12173,15707,27880
bad,17723,4508,22231
did,12732,9324,22056


In [15]:
## Polarity = pos / total

from scipy.stats import hmean
term_freq_df['pos_rate'] = term_freq_df['positive'] * 1./term_freq_df['total']

term_freq_df

Unnamed: 0,negative,positive,total,pos_rate
aa,13,8,21,0.380952
aaa,8,7,15,0.466667
aaargh,5,2,7,0.285714
aag,37,2,39,0.051282
aage,0,5,5,1.000000
...,...,...,...,...
zuniga,17,10,27,0.370370
zurer,2,8,10,0.800000
zwart,1,6,7,0.857143
zwick,10,7,17,0.411765


# Szótár elkészítése

In [27]:
vocab_pred = []

terms = cvec.get_feature_names()
positive_rates = term_freq_df['pos_rate']
sentiment = ["positive" if i > 0.51 else "negative" for i in positive_rates]

vocabulary_df = pd.DataFrame(columns=["terms","sentiment"])

vocabulary_df['terms'] = terms
vocabulary_df['polarity'] = positive_rates.values.tolist()

vocabulary_df['sentiment'] = sentiment

from tabulate import tabulate

print(tabulate(vocabulary_df.sample(20), headers='keys', tablefmt='psql'))

vocabulary_dict = vocabulary_df.to_dict('index')


+-------+---------------+-------------+------------+
|       | terms         | sentiment   |   polarity |
|-------+---------------+-------------+------------|
|  6206 | celibacy      | negative    |   0.428571 |
| 15362 | freaked       | negative    |   0.452381 |
| 38357 | streetfighter | negative    |   0        |
| 33589 | ritt          | positive    |   1        |
| 14995 | fondness      | positive    |   0.725806 |
| 21910 | klemper       | positive    |   1        |
| 44298 | wladyslaw     | positive    |   1        |
| 27424 | notches       | negative    |   0.130435 |
|  1382 | angled        | negative    |   0.5      |
| 26764 | narcolepsy    | negative    |   0        |
|  4146 | bluff         | positive    |   0.588235 |
| 38377 | stretcher     | negative    |   0.272727 |
| 10314 | destruction   | positive    |   0.588235 |
|  6102 | catscratch    | positive    |   1        |
|  5628 | canceled      | positive    |   0.630137 |
| 12126 | eatery        | positive    |   1   

In [33]:
import json, pprint

json = json.dumps(vocabulary_dict)
f = open("vocabulary.json","w")
f.write(pprint.pformat(json))
f.close()

# Tesztelés

In [17]:
pos_hmean = term_freq_df.pos_rate

In [18]:
test_df = pd.concat([x_test, y_test], axis=1).sort_values('target').reset_index()
del test_df['index']

In [19]:
x = test_df.text
y_val_predicted_proba = []
for t in x:
    hmean_scores = [pos_hmean[w] for w in t.split() if w in pos_hmean.index]
    if len(hmean_scores) > 0:
        prob_score = np.mean(hmean_scores)
    else:
        prob_score = np.random.random()
    y_val_predicted_proba.append(prob_score)

In [20]:
pred = [1 if t > 0.51 else 0 for t in y_val_predicted_proba]

y = test_df.target
from sklearn.metrics import accuracy_score
print("Accuracy: " + str(round(accuracy_score(y,pred),2)*100) + "%")

Accuracy: 87.0%


In [21]:
test_df['predicted'] = pred
test_df['porality'] = y_val_predicted_proba

In [22]:
test_df.to_csv('test_results.csv')