In [1]:
import pandas as pd
import re

In [2]:
all_verbs = pd.read_csv('./all_verbs_naive.csv').reset_index(drop = True)
all_verbs

Unnamed: 0,Tense,Modality,Aspect,Causation,Polarity,Person,Voice,Transitivity,Generated Verb-UR,Generated Verb-SR,Stressed Portion
0,present,indicative,imperfective,non-causative,negative,1s,passive,intrans,a-me-*ne*-ger-i-em(ɛ),amneˈgerijemɛ,e
1,present,indicative,imperfective,non-causative,negative,2s,passive,intrans,a-me-*ne*-ger-i-i,amneˈgerii,e
2,present,indicative,imperfective,non-causative,negative,3sm,passive,intrans,a-me-*ne*-ger-i-e,amneˈgerije,e
3,present,indicative,imperfective,non-causative,negative,3sf,passive,intrans,a-me-*ne*-ger-i-iɛ,amneˈgeriijɛ,e
4,present,indicative,imperfective,non-causative,negative,1p,passive,intrans,a-me-*ne*-ger-i-om(ɛ),amneˈgerijomɛ,e
...,...,...,...,...,...,...,...,...,...,...,...
44107,past,subjunctive,pluperfect,non-causative,affirmative,3sm,active,trans,ʌ-χʌst-*abiʃ(ɛ)*,ʌχʌstɛˈbiʃɛ,ɛ
44108,past,subjunctive,pluperfect,non-causative,affirmative,3sf,active,trans,ʌ-χʌst-*abiʃ(ɛ)*,ʌχʌstɛˈbiʃɛ,ɛ
44109,past,subjunctive,pluperfect,non-causative,affirmative,1p,active,trans,ʌ-χʌst-*abimon*,ʌχʌstɛˈbimon,ɛ
44110,past,subjunctive,pluperfect,non-causative,affirmative,2p,active,trans,ʌ-χʌst-*abion*,ʌχʌstɛˈbijon,ɛ


In [3]:
all_verbs['Generated Verb-SR'].apply(len).max() # getting the max length of cells in a column in a df

25

In [4]:
len(all_verbs[all_verbs['Tense']== 'present'])

10400

In [5]:
len(all_verbs[all_verbs['Tense']== 'past'])

33712

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


def logistic_regression_models():
    features = ['Tense', 'Modality', 'Aspect', 'Causation', 'Polarity', 'Person', 'Voice', 'Transitivity']
    model_info = {}
    for feature in features:


        model = LogisticRegression(max_iter=100000) #max number of times the code runs to classify every object

        X = all_verbs['Generated Verb-SR'] #predictor variable
        y = all_verbs[feature] #predicted/target varuiable
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 1)


        # vectorizing the training data
        #use recurrent neural networks instead?
        count_vect = CountVectorizer(token_pattern= '.', ngram_range= (1,3)) # min single character and max 3 chas  
        X_train = count_vect.fit_transform(X_train)
        # 'fit' trains the vectorizer model so that it knows what what unique vectorize numbers will be
        # 'transform' outputs the vectorized values
        X_test = count_vect.transform(X_test) #no need to fit again, otherwise what it sees on training and test data may differ and end up with an erorr


        # encoding the training labels; this acts like getdummies but uses sklearn to make it easier
        le = LabelEncoder()
        y_train = le.fit_transform(y_train) #le.fit() and le.transform() together make le.fit_transform()
        y_test = le.transform(y_test)
        print(le.classes_)#['past' 'present] means past is assigned index 0 and present is assigned  1

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        matrix = confusion_matrix(y_test, y_pred)
        word_predict = model.predict(count_vect.transform(np.array(['ʌbɛmdinijɛst*ɛ*vijɛ'])))
        accuracy = accuracy_score(y_test, y_pred)
        model_info[feature] = { 'model': model,'accuracy': accuracy,'word prediction': word_predict, 'confusion matrix': matrix }
        
    return model_info


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
models = logistic_regression_models() # in the matrix, everything in the first row is related to imperfective

['past' 'present']
['imperative' 'indicative' 'subjunctive']
['imperfective' 'neutral' 'perfect' 'pluperfect']
['causative' 'non-causative']
['affirmative' 'negative']
['1p' '1s' '2p' '2s' '3p' '3sf' '3sm']
['active' 'passive']
['intrans' 'trans']


In [8]:
models #dict of dicts

{'Tense': {'model': LogisticRegression(max_iter=100000),
  'accuracy': 0.9999313045270317,
  'word prediction': array([0]),
  'confusion matrix': array([[11116,     0],
         [    1,  3440]])},
 'Modality': {'model': LogisticRegression(max_iter=100000),
  'accuracy': 0.8588994985230474,
  'word prediction': array([1]),
  'confusion matrix': array([[ 191,    0,   59],
         [   1, 7174,  718],
         [  53, 1223, 5138]])},
 'Aspect': {'model': LogisticRegression(max_iter=100000),
  'accuracy': 0.8406951981864396,
  'word prediction': array([3]),
  'confusion matrix': array([[3158,   18,    1,    0],
         [   1, 4989,  100,    0],
         [   0,  271, 1881,  989],
         [   0,    0,  939, 2210]])},
 'Causation': {'model': LogisticRegression(max_iter=100000),
  'accuracy': 0.9999313045270317,
  'word prediction': array([0]),
  'confusion matrix': array([[ 4488,     0],
         [    1, 10068]])},
 'Polarity': {'model': LogisticRegression(max_iter=100000),
  'accuracy': 0.9

In [9]:
models['Tense']['confusion matrix']
models['Aspect']['confusion matrix']

array([[3158,   18,    1,    0],
       [   1, 4989,  100,    0],
       [   0,  271, 1881,  989],
       [   0,    0,  939, 2210]])

In [10]:
pd.set_option('display.float_format', lambda x: '%.9f' % x) #suppress scientific notations

In [11]:
le = LabelEncoder()
# le.fit_transform(all_verbs[['Tense']])
all_verbs_encoded = all_verbs
all_verbs_encoded = all_verbs_encoded.apply(lambda col: le.fit_transform(col))

In [12]:
corr_matrix = all_verbs_encoded.corr()

In [13]:
corr_matrix

Unnamed: 0,Tense,Modality,Aspect,Causation,Polarity,Person,Voice,Transitivity,Generated Verb-UR,Generated Verb-SR,Stressed Portion
Tense,1.0,-0.031654208,-0.47235488,0.002018538,-0.0,-0.00789692,-0.027944259,0.021986224,-0.076969193,0.000487137,-0.136164889
Modality,-0.031654208,1.0,0.258745226,-0.003016283,0.0,0.011800295,0.04175685,-0.032853813,-0.180270875,0.025376034,0.09010106
Aspect,-0.47235488,0.258745226,1.0,-0.000462761,0.0,0.001810414,0.00640638,-0.005040467,-0.173173958,0.003590471,0.197336596
Causation,0.002018538,-0.003016283,-0.000462761,1.0,0.0,-0.000277531,-0.072234435,-0.07916691,-0.113290931,-0.10142292,0.026966116
Polarity,-0.0,0.0,0.0,0.0,1.0,-0.0,0.0,0.0,-0.184820488,0.355016355,-0.436927871
Person,-0.00789692,0.011800295,0.001810414,-0.000277531,-0.0,1.0,0.003842084,-0.003022908,0.004517105,0.001238193,-0.162099749
Voice,-0.027944259,0.04175685,0.00640638,-0.072234435,0.0,0.003842084,1.0,-0.786788578,0.05388874,0.032220837,0.065282936
Transitivity,0.021986224,-0.032853813,-0.005040467,-0.07916691,0.0,-0.003022908,-0.786788578,1.0,0.016646417,0.032908369,-0.073332363
Generated Verb-UR,-0.076969193,-0.180270875,-0.173173958,-0.113290931,-0.184820488,0.004517105,0.05388874,0.016646417,1.0,0.525853112,0.090379473
Generated Verb-SR,0.000487137,0.025376034,0.003590471,-0.10142292,0.355016355,0.001238193,0.032220837,0.032908369,0.525853112,1.0,-0.087109957


In [16]:
all_verbs.to_csv('./all_verbs_LR.csv')

In [17]:
all_verbs

Unnamed: 0,Tense,Modality,Aspect,Causation,Polarity,Person,Voice,Transitivity,Generated Verb-UR,Generated Verb-SR,Stressed Portion
0,present,indicative,imperfective,non-causative,negative,1s,passive,intrans,a-me-*ne*-ger-i-em(ɛ),amneˈgerijemɛ,e
1,present,indicative,imperfective,non-causative,negative,2s,passive,intrans,a-me-*ne*-ger-i-i,amneˈgerii,e
2,present,indicative,imperfective,non-causative,negative,3sm,passive,intrans,a-me-*ne*-ger-i-e,amneˈgerije,e
3,present,indicative,imperfective,non-causative,negative,3sf,passive,intrans,a-me-*ne*-ger-i-iɛ,amneˈgeriijɛ,e
4,present,indicative,imperfective,non-causative,negative,1p,passive,intrans,a-me-*ne*-ger-i-om(ɛ),amneˈgerijomɛ,e
...,...,...,...,...,...,...,...,...,...,...,...
44107,past,subjunctive,pluperfect,non-causative,affirmative,3sm,active,trans,ʌ-χʌst-*abiʃ(ɛ)*,ʌχʌstɛˈbiʃɛ,ɛ
44108,past,subjunctive,pluperfect,non-causative,affirmative,3sf,active,trans,ʌ-χʌst-*abiʃ(ɛ)*,ʌχʌstɛˈbiʃɛ,ɛ
44109,past,subjunctive,pluperfect,non-causative,affirmative,1p,active,trans,ʌ-χʌst-*abimon*,ʌχʌstɛˈbimon,ɛ
44110,past,subjunctive,pluperfect,non-causative,affirmative,2p,active,trans,ʌ-χʌst-*abion*,ʌχʌstɛˈbijon,ɛ
