# 1. Основы слов в качестве признаков

In [1]:
import os,sys
import pandas as pd
import numpy as np
from collections import Counter
import nltk
import string
import random
from nltk.corpus import stopwords
import nltk.tokenize as word_tokenize
from tqdm import tqdm

from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("russian") # Choose a language

In [2]:
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [4]:
import re 
import emoji

## Разбивка на выборки (кросс-валидация)

In [5]:
from get_fold import get_fold

In [6]:
def getTrainTest(seed = 42):
    """Assemble the train and test frames from the folds returned by ``get_fold``.

    Folds 0-6 are concatenated into the training frame and folds 7-9 into the
    test frame. ``seed`` is forwarded unchanged to every ``get_fold`` call.

    Returns:
        A ``(X_train, X_test)`` tuple of concatenated frames.
    """
    train_parts = []
    for fold_id in range(7):
        train_parts.append(get_fold(K_fold=fold_id, seed=seed))
    X_train = pd.concat(train_parts)

    test_parts = []
    for fold_id in range(7, 10):
        test_parts.append(get_fold(K_fold=fold_id, seed=seed))
    X_test = pd.concat(test_parts)

    return X_train, X_test

## Логи

In [7]:
# Names of the metrics and of the class groupings, listed in the same order
# in which make_log writes them into a log entry.
metrics = [
    'accuracy_score',
    'f1_score',
    'precision_score',
    'recall_score',
]
types = [
    '0/1/-1',
    '0/1,-1',
    '1/0,-1',
    '-1/0,1',
]

def make_log(y_predict, y_test, seed = 42):
    """Render one textual log entry comparing predictions with true labels.

    The entry contains, in order: the seed line, the micro-averaged 3-class
    metrics, three one-vs-rest sections (for labels 0, 1 and -1) and a 3x3
    count matrix. Matrix rows are indexed by the value from ``y_test`` and
    columns by the value from ``y_predict``, in the label order 0, 1, -1.
    Labels are used directly as matrix indices (with -1 remapped to 2), so
    they are expected to be 0, 1 or -1.

    Args:
        y_predict: iterable of predicted labels.
        y_test: iterable of true labels, aligned with ``y_predict``.
        seed: value written on the ``seed`` line of the entry.

    Returns:
        The log entry as a single string.
    """
    def _metric_lines(truth, guess, **average):
        # accuracy_score takes no averaging option; the other three do.
        return ("\naccuracy_score\t" + str(accuracy_score(truth, guess))
                + "\nf1_score\t" + str(f1_score(truth, guess, **average))
                + "\nprecision_score\t" + str(precision_score(truth, guess, **average))
                + "\nrecall_score\t" + str(recall_score(truth, guess, **average)))

    chunks = [
        "\n\nseed\t" + str(seed),
        "\n'3 class classification\t0/1/-1",
        _metric_lines(y_test, y_predict, average='micro'),
    ]

    # One-vs-rest view: the selected label becomes True, everything else False.
    for lbl, title in ((0, "0/1,-1"), (1, "1/0,-1"), (-1, "-1/0,1")):
        truth_mask = [value == lbl for value in y_test]
        guess_mask = [value == lbl for value in y_predict]
        chunks.append("\n2 class classification\t" + title)
        chunks.append(_metric_lines(truth_mask, guess_mask))

    chunks.append("\nmatrix_cross_valid\n")
    counts = np.zeros((3, 3), dtype = int)
    for actual, guessed in zip(y_test, y_predict):
        # Label -1 is stored in the third row/column.
        row = 2 if actual == -1 else actual
        col = 2 if guessed == -1 else guessed
        counts[row, col] += 1
    for row_values in counts:
        chunks.append(','.join(str(cell) for cell in row_values) + '\n')

    return "".join(chunks)

## Случайное распределение с той же статистикой

In [9]:
# Baseline: predictions are a random permutation of labels that has exactly
# the same class counts as the test labels; repeated for seeds 0..49.
LOG_random = ""
for e in tqdm(range(50)):
    X_train, X_test = getTrainTest(seed = e)
    y_test = list(X_test['label'])
    random.seed(e)
    # Sampling the whole population without replacement yields a shuffle.
    y_predict = random.sample([0]*y_test.count(0)+[1]*y_test.count(1)+[-1]*y_test.count(-1),len(y_test))
    # Pass by keyword: make_log's positional order is (y_predict, y_test), and
    # swapping them exchanges precision/recall and transposes the matrix.
    LOG_random += make_log(y_predict=y_predict, y_test=y_test, seed=e)
# The context manager guarantees the log is flushed and the handle is closed.
with open('logs/Randomlogs.txt','w+') as log_file:
    log_file.write(LOG_random)

## Нулевое распределение

In [11]:
# Baseline: every sample is predicted as class 0.
LOG_zero = ""
for e in tqdm(range(1)):
    X_train, X_test = getTrainTest(seed = e)
    y_test = list(X_test['label'])
    # Constant prediction; no randomness is needed to build an all-zero list.
    y_predict = [0] * len(y_test)
    LOG_zero += make_log(y_test=y_test, y_predict=y_predict, seed=e)
# The context manager guarantees the log is flushed and the handle is closed.
with open('logs/Zerologs.txt','w+') as log_file:
    log_file.write(LOG_zero)

## Предобработка данных

In [12]:
##############################
# parsing data
##############################
def decode(a, encoding="utf8"):
    """Return ``a`` as text: bytes are decoded, anything else is passed through.

    Args:
        a: value to normalise; only ``bytes`` instances are converted.
        encoding: codec used to decode ``bytes`` input.

    Returns:
        The decoded string for ``bytes`` input, otherwise ``a`` unchanged.
    """
    if isinstance(a, bytes):
        return a.decode(encoding)
    return a

def tokenize_me(file_text):
    """Split a text into stemmed word tokens followed by its emoji.

    Steps: collect the emoji matches, blank them out of the text, tokenise
    with NLTK, drop punctuation tokens and stop words, stem the remaining
    tokens with the module-level ``stemmer``, append the collected emoji and
    finally strip the « » quote characters from every token.

    Args:
        file_text: the text to tokenise (callers pass it lower-cased).

    Returns:
        list of tokens: stems first, then the emoji in order of appearance.
    """
    # NOTE(review): get_emoji_regexp is believed to be unavailable in
    # emoji >= 2.0 - confirm the pinned version of the package.
    smiles = re.findall(emoji.get_emoji_regexp(), file_text)
    for w in smiles:
        # str.replace returns a new string, so the result has to be rebound;
        # otherwise the emoji stay in the text and are counted twice.
        file_text = file_text.replace(w, ' ')

    tokens = nltk.word_tokenize(file_text, language='english')

    tokens = [i for i in tokens if i not in string.punctuation]
    # A set gives constant-time membership tests for the stop-word filter.
    stop_words = set(stopwords.words('russian'))
    stop_words.update(['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на'])
    tokens = [stemmer.stem(i) for i in tokens if i not in stop_words] + smiles

    tokens = [i.replace(u"«", u"").replace(u"»", u"") for i in tokens]
    return tokens

def get_words_matrix(df):
    """Build the vocabulary from the ``data`` column of ``df``.

    Every text is decoded, lower-cased and tokenised with ``tokenize_me``.
    Only tokens whose total count over the whole frame is greater than 1 and
    less than 250 are kept. The vocabulary size is printed.

    Args:
        df: frame with a ``data`` column holding the texts.

    Returns:
        list of the kept tokens, in order of first appearance.
    """
    counts = Counter()
    # ``.values`` replaces Series.get_values(), which newer pandas releases
    # no longer provide.
    for text in tqdm(df['data'].values):
        counts.update(tokenize_me(decode(text).lower()))
    vocabulary = [word for word, n in counts.items() if 1 < n < 250]
    print("Len: "+str(len(vocabulary)))
    return vocabulary

def get_X_matrix(df, all_words, show_progress = False, dtype = 1):
    """Turn the ``data`` column of ``df`` into a bag-of-words count matrix.

    Args:
        df: frame with a ``data`` column holding the texts.
        all_words: vocabulary; the position of a word is its column index.
        show_progress: unused, kept for backward compatibility.
        dtype: unused, kept for backward compatibility.

    Returns:
        Float array of shape ``(len(df), len(all_words))`` where cell
        ``[row, col]`` counts how often word ``col`` occurs in text ``row``.
        Tokens that are not in ``all_words`` are ignored.
    """
    # One dict lookup per token instead of a linear scan of the vocabulary.
    # setdefault keeps the first position of a word, like list.index does.
    column_of = {}
    for position, word in enumerate(all_words):
        column_of.setdefault(word, position)

    X_train = np.zeros((len(df), len(all_words)))
    # ``.values`` replaces Series.get_values(), which newer pandas releases
    # no longer provide; wrapping the array lets tqdm show the total.
    for c, i in enumerate(tqdm(df['data'].values)):
        for w in tokenize_me(decode(i).lower()):
            ind = column_of.get(w)
            if ind is not None:
                X_train[c, ind] += 1
    return X_train

In [13]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD

# Generate dummy data
import numpy as np
import json
import os
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt

# import warnings                        #Костыль
# warnings.filterwarnings("ignore")

def get_label(lst):
    """Map the position of the largest entry of ``lst`` to a class label.

    Position 0 gives label 0, position 1 gives label 1 and position 2 gives
    label -1. Any other position yields ``None``.
    """
    position_to_label = {0: 0, 1: 1, 2: -1}
    return position_to_label.get(lst.argmax())
    
def NN(X_train, y_train, X_test, y_test, verbose = 0, epochs = 50):
    """Train a small dense network and return its predictions for ``X_test``.

    The network is Dense(512, relu) -> Dense(128, relu) -> Dense(3, softmax),
    trained with SGD and categorical cross-entropy. The model is fitted on the
    training data only; the test data is passed as ``validation_data`` so it
    is monitored during training but never used for weight updates.

    Args:
        X_train: 2-D training feature matrix.
        y_train: training labels with values 0, 1 or -1.
        X_test: 2-D test feature matrix.
        y_test: test labels with values 0, 1 or -1 (validation only).
        verbose: verbosity passed to ``model.fit``.
        epochs: number of training epochs.

    Returns:
        list of predicted labels (0, 1 or -1), one per row of ``X_test``.
    """
    N_out = 3
    N_in = X_train.shape[1]

    def _one_hot(labels):
        # Label -1 is stored in column 2, matching get_label's decoding.
        # The mapping is explicit instead of relying on negative indexing.
        labels = np.asarray(labels)
        return keras.utils.to_categorical(np.where(labels == -1, 2, labels),
                                          num_classes=N_out)

    model = Sequential()
    model.add(Dense(512, activation='relu', input_dim=N_in))
    # Dropout(0.5) after each hidden layer was tried and is disabled.
    model.add(Dense(128, activation='relu'))
    model.add(Dense(N_out, activation='softmax'))
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.8, nesterov=True)
    model.compile(loss='categorical_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])

    # Previously train and test were concatenated and split again with
    # validation_split=0.3, which keeps the test rows out of training only
    # when they are exactly the trailing 30% of the data. Passing them as
    # validation_data makes the separation exact.
    model.fit(X_train,
              _one_hot(y_train),
              validation_data=(X_test, _one_hot(y_test)),
              epochs=epochs,
              batch_size=64,
              verbose=verbose,
              shuffle=False)
    print('===')
    y_predict = [get_label(k) for k in model.predict(X_test, batch_size=32, verbose=0)]
    return y_predict

Using TensorFlow backend.


In [14]:
def LogisticRegressionScore(X_train, y_train, X_test, y_test, seed = 42):
    """Fit ``LogisticRegression(C=0.4)`` and return the log entry for the test set.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: test features and labels.
        seed: value recorded on the ``seed`` line of the log entry; it does
            not influence the model. Defaults to 42, the value that was
            always logged before this parameter existed.

    Returns:
        The text produced by ``make_log`` for the test predictions.
    """
    regr = LogisticRegression(C = 0.4)
    regr.fit(X_train, y_train)
    y_predict = regr.predict(X_test)
    return make_log(y_predict, y_test, seed=seed)
def SVMScore(X_train, y_train, X_test, y_test, seed = 42):
    """Fit ``LinearSVC(C=0.05)`` and return the log entry for the test set.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: test features and labels.
        seed: value recorded on the ``seed`` line of the log entry; it does
            not influence the model. Defaults to 42, the value that was
            always logged before this parameter existed.

    Returns:
        The text produced by ``make_log`` for the test predictions.
    """
    SVM_model = LinearSVC(C = 0.05)
    SVM_model.fit(X_train, y_train)
    y_predict = SVM_model.predict(X_test)
    return make_log(y_predict, y_test, seed=seed)

def NNScore(X_train, y_train, X_test, y_test, verbose = 1, epochs = 10, seed = 42):
    """Train the dense network via ``NN`` and return the log entry for the test set.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: test features and labels.
        verbose: verbosity forwarded to ``NN``.
        epochs: number of epochs forwarded to ``NN``.
        seed: value recorded on the ``seed`` line of the log entry; it is not
            passed to the network. Defaults to 42, the value that was always
            logged before this parameter existed.

    Returns:
        The text produced by ``make_log`` for the test predictions.
    """
    y_predict = NN(X_train, y_train, X_test, y_test, verbose = verbose, epochs=epochs)
    return make_log(y_predict, y_test, seed=seed)

def LR_SVM_NN_logs(X_train, y_train, X_test, y_test):
    """Run all three classifiers on the same split and collect their log entries.

    The models are evaluated in the order logistic regression, linear SVM,
    neural network, each with its own default settings.

    Returns:
        A ``(logreg_log, svm_log, nn_log)`` tuple of log strings.
    """
    split = (X_train, y_train, X_test, y_test)
    logreg_log = LogisticRegressionScore(*split)
    svm_log = SVMScore(*split)
    nn_log = NNScore(*split)
    return (logreg_log, svm_log, nn_log)

In [16]:
# Main experiment: for each of 100 seeds rebuild the split and the vocabulary,
# vectorise both parts, evaluate the three classifiers and rewrite the three
# log files with everything accumulated so far.
SVM_log    = ""
LOGREG_log = ""
NN_log     = ""
for e in range(100):
    df_train, df_test  = getTrainTest(seed = e)
    dtype = 2
    # The vocabulary is built from the training part only.
    all_words = get_words_matrix(df_train)
    print('Train')
    X_train = get_X_matrix(df_train, all_words, show_progress = True)
    print('Test')
    X_test  = get_X_matrix(df_test, all_words, show_progress = True)
    y_train = df_train['label']
    y_test  = df_test['label']

    # NOTE(review): `e` is not forwarded to the scoring helpers, so every
    # entry below is labelled with the default seed 42 rather than `e`.
    print('LogReg')
    LOGREG_log += LogisticRegressionScore(X_train, y_train, X_test, y_test)

    print("SVM")
    SVM_log += SVMScore(X_train, y_train, X_test, y_test)
    NN_log += NNScore(X_train, y_train, X_test, y_test, verbose=0, epochs=13)

    # Rewritten after every seed so partial results survive an interruption;
    # the context managers close each handle instead of leaking three per
    # iteration.
    with open('logs/LogReglogs.txt','w+') as log_file:
        log_file.write(LOGREG_log)
    with open('logs/SVMlogs.txt','w+') as log_file:
        log_file.write(SVM_log)
    with open('logs/NNlogs.txt','w+') as log_file:
        log_file.write(NN_log)