<a href="https://colab.research.google.com/github/o-fugi/FURSPColexification/blob/main/code/Predict_Source_Probability_with_Frequency_and_Concreteness.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/ColabFiles/

Mounted at /content/drive/
/content/drive/MyDrive/ColabFiles


In [3]:
import pandas as pd
import numpy as np
from scipy.special import softmax
from functools import reduce
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [4]:
# create master dataframe with all frequency, concreteness, aoa, valence information

freqDF = pd.read_csv('/content/drive/MyDrive/ColabFiles/COCA_freqs.csv', encoding='ISO-8859-1') # w1, coca_spok
concreteDF = pd.read_csv('/content/drive/MyDrive/ColabFiles/brysbaert_concreteness.csv') # Word, Conc.M
aoaDF = pd.read_csv('/content/drive/MyDrive/ColabFiles/BristolNorms+GilhoolyLogie.csv') # WORD, AoA (100-700)
valenceDF = pd.read_csv('/content/drive/MyDrive/ColabFiles/Ratings_Warriner_et_al.csv') # Word, V.Mean.Sum [or A for arousal, D for dominance]

freqDF.rename(columns = {'w1':'Word'}, inplace = True)
aoaDF.rename(columns = {'WORD':'Word'}, inplace = True)

allVarsDF = reduce(lambda  left,right: pd.merge(left,right,on='Word',
                                            how='outer'), [freqDF, concreteDF, aoaDF, valenceDF])

allVarsDF = allVarsDF.set_index('Word')

filteredVarsDF = allVarsDF[['coca_spok','Conc.M', 'AoA (100-700)', 'V.Mean.Sum']].copy()
filteredVarsDF.rename(columns = {
    'coca_spok' : 'Freq',
    'Conc.M' : 'Conc',
    'AoA (100-700)': 'AoA',
    'V.Mean.Sum' : 'Val'
}, inplace=True)

In [5]:
# valence should be judged differently

filteredVarsDF['ValRate'] = abs(filteredVarsDF['Val']-4)

In [6]:
# normalize with softmax
# actually, I don't end up using softmax in the logistic regression model

def softmax_with_nan(col):
  col = np.nan_to_num(col, nan=-np.inf)
  return softmax(col)

filteredVarsDF['normalized_freq'] = np.where(np.isnan(filteredVarsDF['Freq']), np.nan, softmax_with_nan(np.log(filteredVarsDF['Freq'].to_numpy()))) # log is necessary because otherwise it's just [1, 0, 0, 0, ...]
filteredVarsDF['normalized_conc'] = np.where(np.isnan(filteredVarsDF['Conc']), np.nan, softmax_with_nan(filteredVarsDF['Conc']))
filteredVarsDF['normalized_aoa'] = np.where(np.isnan(filteredVarsDF['AoA']), np.nan, softmax_with_nan(filteredVarsDF['AoA']))
filteredVarsDF['normalized_val'] = np.where(np.isnan(filteredVarsDF['ValRate']), np.nan, softmax_with_nan(filteredVarsDF['ValRate']))

  


In [7]:
# take only words with frequency and concreteness data
filteredVarsDF = filteredVarsDF[(~np.isnan(filteredVarsDF['normalized_freq'])) & (~np.isnan(filteredVarsDF['normalized_conc']))]

In [53]:
# get database with source frequency for each word -- total # of realizations of source / total # of realizations 

datSemShift = pd.read_csv('/content/drive/MyDrive/ColabFiles/datsemshift.csv')

datSemShift = datSemShift[(datSemShift['direction'] == '→')]
datSemShift = datSemShift[(datSemShift['type'] == ' Semantic evolution')]

meaning1DF = pd.DataFrame(datSemShift.groupby('meaning1').size()).rename(columns={0: 'source_freq'}).reset_index()
meaning2DF = pd.DataFrame(datSemShift.groupby('meaning2').size()).rename(columns={0: 'targ_freq'}).reset_index()

semShiftDF = reduce(lambda  left,right: pd.merge(left,right,left_on='meaning1', right_on='meaning2',
                                            how='outer'), [meaning1DF, meaning2DF])

semShiftDF = semShiftDF.fillna(0)

semShiftDF['source_prob'] = semShiftDF['source_freq'] / (semShiftDF['targ_freq'] + semShiftDF['source_freq'])

semShiftDF['word'] = np.where((semShiftDF['meaning1'] == 0), semShiftDF['meaning2'], semShiftDF['meaning1'])

In [56]:
# transform datSemShift data into usable data with the frequency corpus

import re

def semShiftToString(s):
  # if multiple words, delete 'to' if it's the first word
  if (len(s.split(' ')) > 1) & (s.split(' ')[0] == 'to'):
    s = re.sub("^to ", "", s)
  # delete everything in parentheses
  s = re.sub("\(.*?\)","",s)
  # delete everything after  / or ,
  s = re.split('/|,', s)[0]
  # delete trailing or leading spaces
  s = s.strip()
  return s

semShiftDF['word_transform'] = semShiftDF['word'].apply(semShiftToString)

#merge with frequency corpus 

filteredVarsDF = filteredVarsDF.reset_index()
filteredVarsDF_NoRepeats = filteredVarsDF.drop_duplicates(subset='Word') 
df = reduce(lambda  left,right: pd.merge(left,right,left_on='Word', right_on='word_transform',
                                            how='inner'), [filteredVarsDF_NoRepeats, semShiftDF])

df = df.drop_duplicates(subset='word_transform') # handling duplicates just terribly here

# handling 0 frequency just terribly here too 
df['log_freq'] = np.log(df['Freq'])
df = df[~np.isinf(df['log_freq'])]

In [60]:
# now do logistic regression on this stuff

# input = df[['normalized_freq', 'normalized_conc']].copy().to_numpy()
df['source'] = np.where(df['source_prob'] > 0.5, 1, 0)

X = df[['log_freq', 'Conc']].copy().to_numpy()
y = df['source'].copy().to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

model = LogisticRegression(solver='liblinear', class_weight='balanced').fit(X_train, y_train)

# softmax normalization is making the frequency distribution unusable 
# and making the concreteness distribution slightly worse
# but with neither of them normalized (and log frequency) we're getting ~60% accuracy 

In [61]:
model.score(X_test,y_test)

0.4318181818181818

In [62]:
confusion_matrix(y_test, model.predict(X_test))

array([[ 7, 19],
       [ 6, 12]])