In [1]:
import os
import sys
import pandas as pd
import sklearn.metrics as mt
import json
import regex as re
import multiprocessing
import time
import math
from collections import defaultdict
from nltk.tokenize import sent_tokenize
from pymystem3 import Mystem
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

In [2]:
def pos_bi(text, m):
    pos_tags = []
    sents = sent_tokenize(text)
    for sent in sents:
        sent_an = []
        analy = m.analyze(sent)
        for x in analy:
            try:
                if 'analysis' in x.keys():
                    tag = x['analysis'][0]['gr']
                    sent_an.append(re.sub(r'[=|,].*', '', tag).lower())
            except IndexError:
                pass
        pos_tags.append(sent_an)
    return pos_tags

In [3]:
def create_corp(texts, m):
    text_all = []
    for text in texts:
        text_all.append(pos_bi(text, m))
    return text_all
        

In [4]:
def calc_bigrams(corp):
    bis = defaultdict(int)
    uni = defaultdict(int)
    bis_text = defaultdict(int)
    pmis = defaultdict(float)
    total = 0
    for text in corp: #calc bigram totals, text totals, and total words
        text_c = []
        for sent in text:
            total += len(sent)
            for x in range(len(sent)):
                uni[sent[x]] += 1
                if x < len(sent)-1:
                    bi = "{0}:{1}".format(sent[x], sent[x+1])
                    bis[bi] += 1
                    if bi not in text_c:
                        text_c.append(bi)
        for n in text_c:
            bis_text[n] += 1
    for k,v in bis_text.items(): #remove if in less than 75% of texts
        if v/226 < 0.75:
            del bis[k]
    for k,v in bis.items():    #calc pmi of pos pairs
        pos1, pos2 = k.split(':')
        p1v = uni[pos1]
        p2v = uni[pos2]
        ptg = v
        pmi = math.log((ptg/p1v*p2v), 2)
        pmis[k] = pmi
    return pmis, total

In [5]:
def pmi_count_over_total(m, bi, pmi, text):
    poses = pos_bi(text, m)
    pos1, pos2 = bi.split(':')
    count = 0
    total = len(poses)
    for sent in poses:
        for x in range(len(sent)-1):
            if sent[x] == pos1 and sent[x+1] == pos2:
                count +=1
    stat = count/total
    return stat

In [17]:
def imp (filenames, root):
    sheets =[]
    for f in filenames:
        if f != 'sex.csv' and f != 'new_info.csv':
            df = pd.read_csv(os.path.join(root, f), index_col=0)
            sheets.append(df)
        elif f == 'sex.csv':
            df2 = pd.read_csv(os.path.join(root, f))
        else:
            df3 = pd.read_csv(os.path.join(root, f))
    comb = pd.concat(sheets, axis=1)
    new_vec = list((1,0)*113)
    comb['Truth'] = new_vec
    comb['Sex'] = ''
    comb['Avg_sent'], comb['TTR'], comb['Avg_word'], comb['Hapax'], comb['Yules'] = 0,0,0,0,0
    m = Mystem()
    lie_corp = df3[df3['Truth'] == 1]
    true_corp = df3[df3['Truth'] == 0]
    lies = create_corp(lie_corp['Text'], m)
    truths = create_corp(true_corp['Text'], m)
    corp = lies + truths
    pmis, total = calc_bigrams(corp)
    for num in df2['ID']:
        num2 = num - 1
        comb.loc['{0}Л.docx'.format(num), 'Sex'] = df2.ix[num2, 'Пол']
        comb.loc['{0}П.docx'.format(num), 'Sex'] = df2.ix[num2, 'Пол']
    for index, row in df3.iterrows():
        r = int(row['ID'])
        text = str(row['Text'])
        if row['Truth'] == 1:
            comb.loc['{0}Л.docx'.format(r), ['Avg_sent']] = row['Avg_sent']
            comb.loc['{0}Л.docx'.format(r), ['TTR']] = row['TTR']
            comb.loc['{0}Л.docx'.format(r), ['Avg_word']] = row['Avg_word']
            comb.loc['{0}Л.docx'.format(r), ['Hapax']] = row['Hapax']
            comb.loc['{0}Л.docx'.format(r), ['Yules']] = row['Yules']
            for k,v in pmis.items():
                comb[k] = 0
                comb.loc['{0}Л.docx'.format(r), [k]] = pmi_count_over_total(m, k, v, row['Text'])
                
        else:
            comb.loc['{0}П.docx'.format(r), ['Avg_sent']] = row['Avg_sent']
            comb.loc['{0}П.docx'.format(r), ['TTR']] = row['TTR']
            comb.loc['{0}П.docx'.format(r), ['Avg_word']] = row['Avg_word']
            comb.loc['{0}П.docx'.format(r), ['Hapax']] = row['Hapax']
            comb.loc['{0}П.docx'.format(r), ['Yules']] = row['Yules']
            for k,v in pmis.items():
                comb[k] = 0
                comb.loc['{0}Л.docx'.format(r), [k]] = pmi_count_over_total(m, k, v, row['Text'])
    return comb

In [13]:
def log_reg(df, labs):
    train = df.sample(frac=0.7, random_state=1)
    test = df.loc[~df.index.isin(train.index)]
    xtrain = train[labs]
    xtest = test[labs]
    ytrain = train['Truth']
    ytest = test['Truth']
    reg = linear_model.LogisticRegressionCV()
    reg.fit(xtrain,ytrain)
    preds = reg.predict(xtest)
    print(reg.coef_)
    print(mean_squared_error(preds, ytest))
    print(reg.score(xtest, ytest))
    print(mt.r2_score(ytest, preds))
    print(mt.classification_report(ytest, preds, target_names=['Truth', 'Lie']))
    print(mt.accuracy_score(ytest, preds))
    print(mt.confusion_matrix(ytest, preds))


In [14]:
def log_sex(df, labs):
    mal = df[df['Sex'] == 'муж.']
    fem = df[df['Sex'] == 'жен.']

    print(len(mal))
    print(len(fem))

    train_mal = mal.sample(frac=0.6, random_state=1)
    train_fem = fem.sample(frac=0.6, random_state=1)

    test_mal = df.loc[~mal.index.isin(train_mal.index)]
    test_fem = df.loc[~fem.index.isin(train_fem.index)]

    xtrain_mal = train_mal[labs]
    xtrain_fem = train_fem[labs]
    xtest_mal = test_mal[labs]
    xtest_fem = test_fem[labs]

    ytrain_mal = train_mal['Truth']
    ytrain_fem = train_fem['Truth']
    ytest_mal = test_mal['Truth']
    ytest_fem = test_fem['Truth']

    reg_mal = linear_model.LogisticRegressionCV()
    reg_fem = linear_model.LogisticRegressionCV()

    reg_mal.fit(xtrain_mal, ytrain_mal)
    preds_mal = reg_mal.predict(xtest_mal)

    reg_fem.fit(xtrain_fem, ytrain_fem)
    preds_fem = reg_fem.predict(xtest_fem)

    print(reg_mal.coef_)
    print(mean_squared_error(preds_mal, ytest_mal))
    print(reg_mal.score(xtest_mal, ytest_mal))
    print(mt.r2_score(ytest_mal, preds_mal))
    print(mt.classification_report(ytest_mal, preds_mal, target_names=['Truth', 'Lie']))
    print(mt.confusion_matrix(ytest_mal, preds_mal))

    print(reg_fem.coef_)
    print(mean_squared_error(preds_fem, ytest_fem))
    print(reg_fem.score(xtest_fem, ytest_fem))
    print(mt.r2_score(ytest_fem, preds_fem))
    print(mt.classification_report(ytest_fem, preds_fem, target_names=['Truth', 'Lie']))
    print(mt.confusion_matrix(ytest_fem, preds_fem))

In [18]:
for root, dirs, files in os.walk('russian_deception_bank/tables/'):
    f = files
results = imp(f, root)

In [19]:
results.to_csv('allinfo.csv', delimiter=",")

In [10]:
a = list(results)
a = list(filter(lambda x: x != 'Truth' and x != 'Sex' and x != 'Segment', a))
print(a)
print(results)
log_reg(results, a)
log_sex(results, a)

['и', 'Личноеместоимение', 'вербальные', 'Предлог', 'Союз', 'Когнитив', 'Включение', 'AllPunc', 'мест.-сущ.', 'добавление', 'Avg_sent', 'TTR', 'Avg_word', 'Hapax', 'Yules', 'pr:s', 'conj:spro', 's:v', 'adv:v', 'spro:v', 'a:s', 'v:spro', 'pr:spro', 'pr:apro', 's:spro', 'v:s', 'v:adv', 's:s', 'v:pr', 'apro:s', 's:pr', 'v:conj', 'v:v', 'part:v', 'conj:v', 's:conj']
[[  1.25372305e-03   6.26983322e-04   1.84935688e-03   1.92192455e-04
    2.21533709e-03   3.04824868e-03   2.22530755e-03  -3.54849318e-03
    7.06015044e-04   1.25141845e-03   1.21816268e-03  -5.54807004e-06
    1.53655042e-05   1.22745869e-06  -7.92029513e-05   5.32682999e-05
    4.09756153e-06   1.63902461e-05   1.22926846e-05   6.55609845e-05
    3.27804922e-05   1.22926846e-05   1.63902461e-05   8.19512306e-06
    2.45853692e-05   2.04878077e-05   4.09756153e-06   1.63902461e-05
    4.09756153e-05   3.68780538e-05   4.09756153e-05   2.45853692e-05
    4.09756153e-05   1.22926846e-05   8.19512306e-06   1.63902461e-05]]
0.4