In [27]:
import csv
import json
import os
import re

from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
import tqdm

In [28]:
# Input locations: the JSON file holding the sentiment labels and the folder
# that contains the CSV files those labels refer to.
FILE_NAME = 'sentiments.json'
CSV_FOLDER = 'data'

# Raise the csv module's per-field size cap to 2**31 - 1. The call returns the
# previous limit, which the notebook echoes as this cell's output.
csv.field_size_limit(2 ** 31 - 1)

2147483647

In [29]:
def gen_dict_extract(key, var):
    """Lazily yield every value stored under ``key`` anywhere inside ``var``.

    ``var`` is walked depth-first. Only objects exposing ``.items()`` are
    inspected; anything else yields nothing. For each entry the value is
    yielded when its key equals ``key``; independently of that, ``dict``
    values are searched recursively and each element of ``list`` values is
    searched recursively. Lists nested directly inside lists are not
    descended into.
    """
    if not hasattr(var, 'items'):
        return
    for name, value in var.items():
        if name == key:
            yield value
        # A matching value that is itself a container is still searched.
        if isinstance(value, dict):
            yield from gen_dict_extract(key, value)
        elif isinstance(value, list):
            for item in value:
                yield from gen_dict_extract(key, item)

In [30]:
def read_csv(path):
    """Yield the rows of the CSV file at ``path`` as dicts keyed by header.

    The file is read as UTF-8 and opened with ``newline=''`` as the ``csv``
    module requires, so line breaks embedded in quoted fields are preserved
    instead of being rewritten by universal-newline translation.

    If an ``IOError`` occurs (for example the file cannot be opened), an
    error message is printed and a single empty string ``""`` is yielded as
    a sentinel before the generator ends.
    """
    try:
        # The ``with`` block closes the file; no explicit close() is needed.
        with open(path, encoding='utf-8', newline='') as data:
            reader = csv.DictReader(data)
            for row in reader:
                yield row
    except IOError:
        print("ERROR: file [" + path + "] not found/accessible")
        # Sentinel kept for backward compatibility with existing callers.
        yield ""

In [31]:
# Load the sentiment records. The encoding is given explicitly so the result
# does not depend on the platform's locale default (read_csv also uses UTF-8).
with open(FILE_NAME, encoding='utf-8') as f:
    sentiments = json.load(f)

In [32]:
# Group the labelled row indices by the file they belong to:
# file name -> list of 'file_idx' values, in the order they appear in `sentiments`.
file_idx = {}
for sentiment in sentiments:
    file_idx.setdefault(sentiment['file_name'], []).append(sentiment['file_idx'])

In [33]:
# Build the training corpus: for every labelled CSV row, extract its text,
# normalise it (tokenise, strip non-word characters, drop stopwords, stem) and
# pair it with the sentiment recorded for that row.
texts_to_vectorize = []
predicted_sentiments = []

# Loop-invariant tooling, created once instead of once per row / per token.
# NOTE(review): every non-English row is processed as German — confirm that
# the data set contains no other languages.
language = 'german'
tokenizer = ToktokTokenizer()
pattern = re.compile(r'[\W_]+')
stemmer = SnowballStemmer(language)
stop_words = set(stopwords.words(language))

# (file name, row index) -> sentiment record. setdefault keeps the first
# record for a duplicated key, matching a "first match wins" linear scan.
sentiment_by_row = {}
for entry in sentiments:
    sentiment_by_row.setdefault((entry['file_name'], entry['file_idx']), entry)

for file_name in tqdm.tqdm(file_idx.keys()):
    wanted_rows = set(file_idx[file_name])
    last_wanted_row = max(wanted_rows)
    csv_path = os.path.join(CSV_FOLDER, file_name)
    for idx, row in enumerate(read_csv(csv_path)):
        if idx > last_wanted_row:
            # Nothing labelled beyond this point; stop reading the file.
            break
        if idx not in wanted_rows:
            continue
        try:
            tx = json.loads(row["tx"])
        except Exception:
            print("ERROR: failed to parse row #" + str(idx) + " from [" + csv_path + "]")
            # NOTE(review): this abandons the rest of the file, not just the
            # bad row — confirm that is intended.
            break
        lang = str(row["la"])
        if lang == 'en':
            continue
        whole_text = ''.join(
            elem
            for tx_elem in tx
            for elem in gen_dict_extract('text', tx_elem)
        )
        # split()/join collapses every whitespace run (tabs included) into a
        # single space, so no separate tab handling is required.
        whole_text = ' '.join(whole_text.split())
        if len(whole_text) == 0:
            continue
        text_tokens = tokenizer.tokenize(whole_text)
        text_tokens = [pattern.sub('', x) for x in text_tokens]
        # isalnum() also discards tokens that became empty after stripping.
        text_tokens = [x for x in text_tokens if x.lower() not in stop_words and x.isalnum()]
        text_tokens = [stemmer.stem(x) for x in text_tokens]
        texts_to_vectorize.append(' '.join(text_tokens))
        matching_sentiment = sentiment_by_row[(file_name, idx)]
        predicted_sentiments.append(matching_sentiment['sentiment'])

100%|██████████| 3/3 [12:24<00:00, 248.04s/it]


In [34]:
# Sanity check: print the number of labels, then the number of texts.
for collected in (predicted_sentiments, texts_to_vectorize):
    print(len(collected))

13258
13258


In [35]:
# Convert the collected labels into a NumPy array.
predicted_sentiments = np.array(predicted_sentiments)

# Fit a TF-IDF vectorizer on the preprocessed texts and transform them in the
# same step; `vectorizer` is kept because it is pickled further down.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(texts_to_vectorize)

In [36]:
# k_fold = StratifiedKFold(n_splits=3, shuffle=True)
# splits = k_fold.split(vectors, predicted_sentiments)

In [37]:
# print(splits)

In [38]:
# from sklearn.preprocessing import StandardScaler
# from sklearn.neural_network import MLPClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.naive_bayes import GaussianNB

In [39]:
# Random forests are stochastic; fixing random_state makes the trained (and
# later pickled) model reproducible across Restart & Run All.
cls = RandomForestClassifier(random_state=42)

In [40]:
# for train_index, test_index in splits:
#     X_train, X_test = vectors[train_index], vectors[test_index]
#     y_train, y_test = predicted_sentiments[train_index], predicted_sentiments[test_index]
#     cls.fit(X=X_train, y=y_train)
#     print(cls.score(X_test, y_test))

In [41]:
from sklearn.model_selection import cross_validate
# Scorer names: precision / recall / F1, each with micro, macro and weighted
# averaging, followed by plain accuracy.
scores = [
    f"{metric}_{average}"
    for metric in ("precision", "recall", "f1")
    for average in ("micro", "macro", "weighted")
] + ["accuracy"]

In [42]:
# validation = cross_validate(cls, X=vectors, y=predicted_sentiments, cv=k_fold, scoring=scores,
#                               return_train_score=True)
# for k, v in validation.items():
#     print(f'{k}: {v}')

In [43]:
# Train the classifier on the full TF-IDF matrix and its labels.
cls.fit(vectors, predicted_sentiments)

RandomForestClassifier()

In [44]:
import pickle

In [46]:
# Persist the fitted vectorizer and the trained model. Context managers
# guarantee the files are flushed and closed once written.
# Pickle files can execute arbitrary code when loaded; only load these
# artefacts from trusted sources.
with open("vocabulary.tfidf", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
with open("rf.mdl", "wb") as model_file:
    pickle.dump(cls, model_file)