In [47]:
import pandas as pd
import re

In [48]:
all_verbs = pd.read_csv('./all_verbs_naive.csv').reset_index(drop = True)
all_verbs

Unnamed: 0,Tense,Modality,Aspect,Causation,Polarity,Person,Voice,Transitivity,Generated Verb-UR,Generated Verb-SR,Stressed Portion
0,present,indicative,imperfective,non-causative,negative,1s,passive,intrans,a-me-*ne*-ger-i-em(ɛ),amneˈgerijemɛ,e
1,present,indicative,imperfective,non-causative,negative,2s,passive,intrans,a-me-*ne*-ger-i-i,amneˈgerii,e
2,present,indicative,imperfective,non-causative,negative,3sm,passive,intrans,a-me-*ne*-ger-i-e,amneˈgerije,e
3,present,indicative,imperfective,non-causative,negative,3sf,passive,intrans,a-me-*ne*-ger-i-iɛ,amneˈgeriijɛ,e
4,present,indicative,imperfective,non-causative,negative,1p,passive,intrans,a-me-*ne*-ger-i-om(ɛ),amneˈgerijomɛ,e
...,...,...,...,...,...,...,...,...,...,...,...
34475,past,subjunctive,perfect,non-causative,affirmative,3sm,active,trans,ʌ-χʌst-*abiʃ(ɛ)*,ʌχʌstɛˈbiʃɛ,ɛ
34476,past,subjunctive,perfect,non-causative,affirmative,3sf,active,trans,ʌ-χʌst-*abiʃ(ɛ)*,ʌχʌstɛˈbiʃɛ,ɛ
34477,past,subjunctive,perfect,non-causative,affirmative,1p,active,trans,ʌ-χʌst-*abimon*,ʌχʌstɛˈbimon,ɛ
34478,past,subjunctive,perfect,non-causative,affirmative,2p,active,trans,ʌ-χʌst-*abion*,ʌχʌstɛˈbijon,ɛ


In [49]:
all_verbs['Generated Verb-SR'].apply(len).max() # getting the max length of cells in a column in a df

25

In [50]:
len(all_verbs[all_verbs['Tense']== 'present'])

10400

In [51]:
len(all_verbs[all_verbs['Tense']== 'past'])

24080

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


def logistic_regression_models():
    features = ['Tense', 'Modality', 'Aspect', 'Causation', 'Polarity', 'Person', 'Voice', 'Transitivity']
    model_info = {}
    for feature in features:


        model = LogisticRegression(max_iter=100000) #max number of times the code runs to classify every object # add C as an argument

        X = all_verbs['Generated Verb-SR'] #predictor variable
        y = all_verbs[feature] #predicted/target varuiable
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 1)


        # vectorizing the training data
        #use recurrent neural networks instead?
        count_vect = CountVectorizer(token_pattern= '.', ngram_range= (1,3)) # min single character and max 3 chas  
        X_train = count_vect.fit_transform(X_train)
        # 'fit' trains the vectorizer model so that it knows what what unique vectorize numbers will be
        # 'transform' outputs the vectorized values
        X_test = count_vect.transform(X_test) #no need to fit again, otherwise what it sees on training and test data may differ and end up with an erorr


        # encoding the training labels; this acts like getdummies but uses sklearn to make it easier
        le = LabelEncoder()
        y_train = le.fit_transform(y_train) #le.fit() and le.transform() together make le.fit_transform()
        y_test = le.transform(y_test)
        print(le.classes_)#['past' 'present] means past is assigned index 0 and present is assigned  1

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        matrix = confusion_matrix(y_test, y_pred)
        word_predict = model.predict(count_vect.transform(np.array(['ʌbɛmdinijɛst*ɛ*vijɛ'])))
        accuracy = accuracy_score(y_test, y_pred)
        model_info[feature] = { 'model': model,'accuracy': accuracy,'word prediction': word_predict, 'confusion matrix': matrix }
        
    return model_info


In [53]:
from sklearn.feature_extraction.text import CountVectorizer
models = logistic_regression_models() # in the matrix, everything in the first row is related to imperfective

['past' 'present']
['imperative' 'indicative' 'subjunctive']
['imperfective' 'neutral' 'perfect' 'pluperfect']
['causative' 'non-causative']
['affirmative' 'negative']
['1p' '1s' '2p' '2s' '3p' '3sf' '3sm']
['active' 'passive']
['intrans' 'trans']


In [54]:
models #dict of dicts

{'Tense': {'model': LogisticRegression(max_iter=100000),
  'accuracy': 0.9995605940768082,
  'word prediction': array([0]),
  'confusion matrix': array([[7959,    3],
         [   2, 3415]])},
 'Modality': {'model': LogisticRegression(max_iter=100000),
  'accuracy': 0.9912118815361631,
  'word prediction': array([1]),
  'confusion matrix': array([[ 186,    0,   42],
         [   5, 7927,    1],
         [  50,    2, 3166]])},
 'Aspect': {'model': LogisticRegression(max_iter=100000),
  'accuracy': 0.973547763423851,
  'word prediction': array([3]),
  'confusion matrix': array([[3090,   11,    0,    0],
         [  11, 3364,  115,    0],
         [   0,  164, 3012,    0],
         [   0,    0,    0, 1612]])},
 'Causation': {'model': LogisticRegression(max_iter=100000),
  'accuracy': 0.9999121188153617,
  'word prediction': array([0]),
  'confusion matrix': array([[3579,    0],
         [   1, 7799]])},
 'Polarity': {'model': LogisticRegression(max_iter=100000),
  'accuracy': 0.9991211881

In [55]:
models['Tense']['confusion matrix']
models['Aspect']['confusion matrix']

array([[3090,   11,    0,    0],
       [  11, 3364,  115,    0],
       [   0,  164, 3012,    0],
       [   0,    0,    0, 1612]])

In [56]:
pd.set_option('display.float_format', lambda x: '%.9f' % x) #suppress scientific notations

In [57]:
le = LabelEncoder()
# le.fit_transform(all_verbs[['Tense']])
all_verbs_encoded = all_verbs
all_verbs_encoded = all_verbs_encoded.apply(lambda col: le.fit_transform(col))

In [58]:
corr_matrix = all_verbs_encoded.corr()

In [59]:
corr_matrix

Unnamed: 0,Tense,Modality,Aspect,Causation,Polarity,Person,Voice,Transitivity,Generated Verb-UR,Generated Verb-SR,Stressed Portion
Tense,1.0,0.17895382,-0.47858419,0.002182821,0.0,-0.008557094,-0.030232385,0.023765178,-0.103709195,0.001996433,-0.144532348
Modality,0.17895382,1.0,0.137153279,-0.003715693,-0.0,0.014566259,0.051462885,-0.040454123,-0.22004477,0.03404212,0.083283786
Aspect,-0.47858419,0.137153279,1.0,-0.000393072,-0.0,0.001540919,0.005444099,-0.004279517,-0.221101093,0.001237601,0.186629106
Causation,0.002182821,-0.003715693,-0.000393072,1.0,0.0,-0.000354115,-0.072201409,-0.079773209,-0.107949923,-0.10379847,0.023011675
Polarity,0.0,-0.0,-0.0,0.0,1.0,0.0,0.0,-0.0,-0.146574765,0.319393127,-0.416734825
Person,-0.008557094,0.014566259,0.001540919,-0.000354115,0.0,1.0,0.004904547,-0.003855383,0.005859857,0.001794333,-0.152010966
Voice,-0.030232385,0.051462885,0.005444099,-0.072201409,0.0,0.004904547,1.0,-0.786083469,0.055043321,0.033831673,0.071571306
Transitivity,0.023765178,-0.040454123,-0.004279517,-0.079773209,-0.0,-0.003855383,-0.786083469,1.0,0.013512036,0.032394022,-0.07916556
Generated Verb-UR,-0.103709195,-0.22004477,-0.221101093,-0.107949923,-0.146574765,0.005859857,0.055043321,0.013512036,1.0,0.524329298,0.076375276
Generated Verb-SR,0.001996433,0.03404212,0.001237601,-0.10379847,0.319393127,0.001794333,0.033831673,0.032394022,0.524329298,1.0,-0.049285734


In [60]:
all_verbs.to_csv('./all_verbs_LR.csv')

In [61]:
all_verbs

Unnamed: 0,Tense,Modality,Aspect,Causation,Polarity,Person,Voice,Transitivity,Generated Verb-UR,Generated Verb-SR,Stressed Portion
0,present,indicative,imperfective,non-causative,negative,1s,passive,intrans,a-me-*ne*-ger-i-em(ɛ),amneˈgerijemɛ,e
1,present,indicative,imperfective,non-causative,negative,2s,passive,intrans,a-me-*ne*-ger-i-i,amneˈgerii,e
2,present,indicative,imperfective,non-causative,negative,3sm,passive,intrans,a-me-*ne*-ger-i-e,amneˈgerije,e
3,present,indicative,imperfective,non-causative,negative,3sf,passive,intrans,a-me-*ne*-ger-i-iɛ,amneˈgeriijɛ,e
4,present,indicative,imperfective,non-causative,negative,1p,passive,intrans,a-me-*ne*-ger-i-om(ɛ),amneˈgerijomɛ,e
...,...,...,...,...,...,...,...,...,...,...,...
34475,past,subjunctive,perfect,non-causative,affirmative,3sm,active,trans,ʌ-χʌst-*abiʃ(ɛ)*,ʌχʌstɛˈbiʃɛ,ɛ
34476,past,subjunctive,perfect,non-causative,affirmative,3sf,active,trans,ʌ-χʌst-*abiʃ(ɛ)*,ʌχʌstɛˈbiʃɛ,ɛ
34477,past,subjunctive,perfect,non-causative,affirmative,1p,active,trans,ʌ-χʌst-*abimon*,ʌχʌstɛˈbimon,ɛ
34478,past,subjunctive,perfect,non-causative,affirmative,2p,active,trans,ʌ-χʌst-*abion*,ʌχʌstɛˈbijon,ɛ


In [62]:
all_verbs[all_verbs['Generated Verb-SR'] == 'eˈt:emɛ']

Unnamed: 0,Tense,Modality,Aspect,Causation,Polarity,Person,Voice,Transitivity,Generated Verb-UR,Generated Verb-SR,Stressed Portion
