<a href="https://colab.research.google.com/github/o-fugi/FURSPColexification/blob/main/code/Predict_Source_Probability_with_Frequency_and_Concreteness.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/ColabFiles/

Mounted at /content/drive/
/content/drive/MyDrive/ColabFiles


In [2]:
import pandas as pd
import numpy as np
from scipy.special import softmax
from functools import reduce
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [278]:
# create master dataframe with all frequency, concreteness, aoa, valence information

freqDF = pd.read_csv('/content/drive/MyDrive/ColabFiles/COCA_freqs.csv', encoding='ISO-8859-1') # w1, coca_spok
concreteDF = pd.read_csv('/content/drive/MyDrive/ColabFiles/brysbaert_concreteness.csv') # Word, Conc.M
aoaDF = pd.read_csv('/content/drive/MyDrive/ColabFiles/BristolNorms+GilhoolyLogie.csv') # WORD, AoA (100-700)
valenceDF = pd.read_csv('/content/drive/MyDrive/ColabFiles/Ratings_Warriner_et_al.csv') # Word, V.Mean.Sum [or A for arousal, D for dominance]

freqDF.rename(columns = {'L1':'Word'}, inplace = True)
aoaDF.rename(columns = {'WORD':'Word'}, inplace = True)

# if there are duplicate words, sum the frequency
freqDF = freqDF.groupby('Word').sum()

allVarsDF = reduce(lambda  left,right: pd.merge(left,right,on='Word',
                                            how='outer'), [freqDF, concreteDF, aoaDF, valenceDF])

allVarsDF = allVarsDF.set_index('Word')

filteredVarsDF = allVarsDF[['coca_spok','Conc.M', 'AoA (100-700)', 'V.Mean.Sum']].copy()
filteredVarsDF.rename(columns = {
    'coca_spok' : 'Freq',
    'Conc.M' : 'Conc',
    'AoA (100-700)': 'AoA',
    'V.Mean.Sum' : 'Val'
}, inplace=True)

filteredVarsDF = filteredVarsDF.reset_index() # add 'Word' as a column, not an index

In [264]:
# valence should be judged differently

filteredVarsDF['ValRate'] = abs(filteredVarsDF['Val']-4)

In [None]:
#@title 
# # normalize with softmax
# # actually, I don't end up using softmax in the logistic regression model

# def softmax_with_nan(col):
#   col = np.nan_to_num(col, nan=-np.inf)
#   return softmax(col)

# filteredVarsDF['normalized_freq'] = np.where(np.isnan(filteredVarsDF['Freq']), np.nan, softmax_with_nan(np.log(filteredVarsDF['Freq'].to_numpy()))) # log is necessary because otherwise it's just [1, 0, 0, 0, ...]
# filteredVarsDF['normalized_conc'] = np.where(np.isnan(filteredVarsDF['Conc']), np.nan, softmax_with_nan(filteredVarsDF['Conc']))
# filteredVarsDF['normalized_aoa'] = np.where(np.isnan(filteredVarsDF['AoA']), np.nan, softmax_with_nan(filteredVarsDF['AoA']))
# filteredVarsDF['normalized_val'] = np.where(np.isnan(filteredVarsDF['ValRate']), np.nan, softmax_with_nan(filteredVarsDF['ValRate']))

  


In [265]:
# take only words with frequency and concreteness data
filteredVarsDF = filteredVarsDF[(~np.isnan(filteredVarsDF['Freq'])) & (~np.isnan(filteredVarsDF['Conc']))]

In [266]:
# get database with source frequency for each word -- total # of realizations of source / total # of realizations 

datSemShift = pd.read_csv('/content/drive/MyDrive/ColabFiles/datsemshift.csv')

# filter semantic shifts
datSemShift = datSemShift[(datSemShift['gendirection'] == '→')]
datSemShift = datSemShift[(datSemShift['type'] == ' Semantic evolution') | (datSemShift['type'] == ' Polysemy')]
datSemShift = datSemShift[(datSemShift['language1'] == datSemShift['language2'])]
datSemShift = datSemShift[(datSemShift['lexeme1'] == datSemShift['lexeme2'])]
datSemShift = datSemShift[~datSemShift['meaning1'].str.contains('<')]
datSemShift = datSemShift[(datSemShift['status']!='Suspended') & (datSemShift['status']!='Rejected')]

# group by shifts and count realizations
semShiftDF = pd.DataFrame(datSemShift.groupby(['meaning1', 'meaning2']).size()).rename(columns={0:'realizations'})
semShiftDF = semShiftDF.reset_index()

# filter based on shifts that occured in at least two languages -- cuts the dataset about in half, but maybe good for reliability
semShiftDF = semShiftDF[semShiftDF['realizations'] > 1]

# meaning1DF = pd.DataFrame(datSemShift.groupby('meaning1').size()).rename(columns={0: 'source_freq'}).reset_index()
# meaning2DF = pd.DataFrame(datSemShift.groupby('meaning2').size()).rename(columns={0: 'targ_freq'}).reset_index()

# semShiftDF = reduce(lambda  left,right: pd.merge(left,right,left_on='meaning1', right_on='meaning2',
#                                             how='outer'), [meaning1DF, meaning2DF])

# semShiftDF = semShiftDF.fillna(0)

# semShiftDF['source_prob'] = semShiftDF['source_freq'] / (semShiftDF['targ_freq'] + semShiftDF['source_freq'])

# semShiftDF['word'] = np.where((semShiftDF['meaning1'] == 0), semShiftDF['meaning2'], semShiftDF['meaning1'])

In [218]:
# transform datSemShift data into usable data with the frequency corpus

# import re

# def semShiftToString(s): # this is the kicker... needs more work
#   # if multiple words, delete 'to' if it's the first word
#   if (len(s.split(' ')) > 1) & (s.split(' ')[0] == 'to'):
#     s = re.sub("^to ", "", s)
#   # delete everything in parentheses
#   # s = re.sub("\(.*?\)","",s)
#   # delete everything after  / or ,
#   s = re.split('/|,', s)[0]
#   # delete trailing or leading spaces
#   s = s.strip()
#   return s

# semShiftDF['meaning1_transform'] = semShiftDF['meaning1'].apply(semShiftToString)
# semShiftDF['meaning2_transform'] = semShiftDF['meaning2'].apply(semShiftToString)

# #merge with frequency corpus 

# filteredVarsDF_NoRepeats = filteredVarsDF.drop_duplicates(subset='Word') # how to deal with duplicates in the frequency data? currently just taking more frequent

# df = reduce(lambda  left,right: pd.merge(left,right,left_on='Word', right_on='meaning1_transform', how='inner'), [filteredVarsDF_NoRepeats, semShiftDF])
# df = df.rename(columns={'Freq':'freq_1', 'Conc': 'conc_1', 'AoA': 'aoa_1', 'Val': 'val_1'})
# df = reduce(lambda  left,right: pd.merge(left,right,left_on='Word', right_on='meaning2_transform', how='inner'), [filteredVarsDF_NoRepeats, df])
# df = df.rename(columns={'Freq':'freq_2', 'Conc': 'conc_2', 'AoA': 'aoa_2', 'Val': 'val_2'})

# print("losing", len(semShiftDF) - len(df), "shifts when translating from datSemShift to freq data, leaving us with", len(df), "shifts")

losing 816 shifts when translating from datSemShift to freq data, leaving us with 674 shifts


In [268]:
# if there's more than one phrase separated by a comma or slash, take the sum of their frequencies and average of their concreteness

import re

def semShiftToPhrase(s):
  phrases = re.split('/|,', s)
  transform_phrases = []
  total_freq = 0
  total_conc = 0
  freq_data = False
  conc_data = False
  for p in phrases:
    p = p.strip()
    if (len(p.split(' ')) > 1) & (p.split(' ')[0] == 'to'):
      p = re.sub("^to ", "", p)
    transform_phrases.append(p)
    try:
      total_freq += filteredVarsDF[filteredVarsDF['Word']==p]['Freq'].values[0]
      freq_data = True
    except:
      pass
    try:
      total_conc += filteredVarsDF[filteredVarsDF['Word']==p]['Conc'].values[0]
      conc_data = True
    except:
      pass
  total_conc /= len(phrases)
  if not freq_data:
    total_freq = np.nan
  if not conc_data:
    total_conc = np.nan
  return total_freq, total_conc
  #print(filteredVarsDF_NoRepeats[filteredVarsDF_NoRepeats['Word']==transform_phrases]['Freq'])
    
semShiftDF['freq_conc'] = semShiftDF['meaning1'].apply(semShiftToPhrase)
semShiftDF[['freq_1', 'conc_1']] = pd.DataFrame(semShiftDF['freq_conc'].tolist(), index=semShiftDF.index)
semShiftDF['freq_conc'] = semShiftDF['meaning2'].apply(semShiftToPhrase)
semShiftDF[['freq_2', 'conc_2']] = pd.DataFrame(semShiftDF['freq_conc'].tolist(), index=semShiftDF.index)

# filter based on whether concreteness and frequency data exists for meaning1 and meaning2
semShiftDF = semShiftDF[~np.isnan(semShiftDF['freq_1']) & ~np.isnan(semShiftDF['conc_1']) &  ~np.isnan(semShiftDF['freq_2']) & ~np.isnan(semShiftDF['conc_2'])]

In [269]:
semShiftDF['freq_ratio'] = semShiftDF['freq_1'] / semShiftDF['freq_2']
semShiftDF['freq_ratio_accurate'] = semShiftDF['freq_ratio'] > 1
semShiftDF['freq_ratio_accurate'].value_counts()

True     399
False    297
Name: freq_ratio_accurate, dtype: int64

In [270]:
semShiftDF['conc_ratio'] = semShiftDF['conc_1'] / semShiftDF['conc_2']
semShiftDF['conc_ratio_accurate'] = semShiftDF['conc_ratio'] > 1
semShiftDF['conc_ratio_accurate'].value_counts()
# was 336 / 439 = 76%
# now 500 / 696 = 71%

True     500
False    196
Name: conc_ratio_accurate, dtype: int64

In [None]:
#@title old code from testing logistic regression

# input = df[['normalized_freq', 'normalized_conc']].copy().to_numpy()
# df['source'] = np.where(df['source_prob'] > 0.5, 1, 0)

# X = df[['Conc']].copy().to_numpy()
# y = df['source'].copy().to_numpy()

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# model = LogisticRegression(solver='liblinear', class_weight='balanced').fit(X_train, y_train)

# print(model.score(X_test,y_test))

#print(confusion_matrix(y_test, model.predict(X_test)))

# softmax normalization is making the frequency distribution unusable 
# and making the concreteness distribution slightly worse
# but with neither of them normalized (and log frequency) we're getting ~60% accuracy 