In [16]:
# import packages, declare constants, get the parent directory
import pandas as pd
import numpy as np
import re
import os
import time
import decimal
import numpy as np
#from scipy import stats
#from wordcloud import WordCloud
#import matplotlib.pyplot as plt
#from .utils import helpers
#import simplemma
#import pyvoikko

CHATGPT_RELEASE_YEAR = int(2022)
FINNISH_ALPHABET = 'abcdefghijklmnopqrstuvwxyzåäö'

def get_parent_directory() -> str:
    """Get the parent directory for handling csv files.

    Returns:
        string: the path to the directory where directories for csv files are located
    """
    #create relative path for parent
    relative_parent = os.path.join(os.getcwd(), '..')

    #use abspath for absolute parent path
    return str(os.path.abspath(relative_parent)).replace('\\', '/')

directory = get_parent_directory()

DECLARE FUNCTIONS

In [2]:
def clean_string(string: str) -> str:
    try:
        # remove blanks in start and end
        string = string.strip()
        string = string.lower()
        # the string must contain characters
        if any(c in string for c in FINNISH_ALPHABET)==False:
            string = ''
        # remove tabulations, line breaks etc., also special characters
        remove_these = r'[\+\*!"”?.,…()§\'[\] \t\n\r\f\v]'
        string = re.sub(remove_these, '', string)
        # remove weird parentheses and backwards linebreaks from starts of strings
        string = re.sub(r'^\)\\[a-z]', '', string)
        # remove weird '\[alphabet]' strings at start of strings
        string = re.sub(r'^\\[a-z]', '', string)
        # remove numbers
        string = re.sub(r'[0-9]', '', string)
        # remove dashes '-' at the start and end of string
        string = re.sub(r'^-|-$', '', string)
        # remove individual forward and backward slashes '/', '\'
        string = re.sub(r'[\/\\]', '', string)
        # remove double dashes '--'
        string = string.replace('--', '-')
        # remove the equal sign '='
        string = string.replace('=', '')
        # at the end of the cleaning, remove all characters from the string which are not in the alphabet except for dash (compound words)
        remove_these = ''.join([str(c) for c in string if c != '-' and c not in [i for i in FINNISH_ALPHABET]])
        string = re.sub(remove_these, '', string)
        # remove blanks in start and end again
        string = string.strip()
        # remove empty if string length < 2
        string = '' if len(string) < 2 else string
        return string
    except:
        print(f'Unexpected error at helpers.clean_string(), string: {string}')
        raise

In [3]:
def count_word_freqs_in_string(string: str):
    """Counts the words in the input string.
    Returns a dictionary where the word is the key and the frequency is the value.
    """
    if ((string is None) or (string == 'nan')):
        return None
    else:
        words_list = re.split(' ', string)
        wordfreq_dict = {}
        for word in words_list:
            if word not in wordfreq_dict.keys():
                wordfreq_dict[word] = 1
            else:
                wordfreq_dict[word] += 1

        return wordfreq_dict

In [62]:
def get_normalised(df: pd.DataFrame, speaker_id: str, year: int, word_n: int) -> float:
    w_min = df['word_n'].loc[(df['speaker_id']==speaker_id)&(df['year']==year)].min()
    w_max = df['word_n'].loc[(df['speaker_id']==speaker_id)&(df['year']==year)].max()
    return ((word_n-w_min)/(w_max-w_min))

In [54]:
# read lemmatised csvs for years 2015-2025
# the goal is to check if word frequencies per SPEAKER change over years

df = pd.read_csv(f'{directory}/csv_lemmatized/speeches_2015.csv', sep=';', encoding='utf-8', header=0, dtype=str)

In [None]:
# clean 'speaker_id' column

df['speaker_id'] = df.apply(lambda x: str(x['speaker_id']).strip() if ((x['speaker_id'] is not None) & (x['speaker_id'] is not np.nan)) else x['speaker_id'], axis=1)
df['speaker_id'] = df.apply(lambda x: '' if x['speaker_id']=='nan' else x['speaker_id'], axis=1)
# df['speaker_id'].loc[(df['speaker_id'].notna()==True)&(df['speaker_id'].str.len()>0)].unique()

In [16]:
# create the data in loops: 
## > loop the csvs for years 2015-2025
## > extract each word per speaker per df (=year), this is done in a for loop
## > combine them all into dataframe df_speaker_words_year
# declare the years
year, max_year = 2015, 2025
# start looping the csvs
while year <= max_year:
    # name for saving the file
    save_file_name = f'speaker_words_{year}.csv'
    if save_file_name in os.listdir(f'{directory}/csv_analysis/'):
        pass
    else:
        # get the csv to match the year from directory: directory/csv_lemmatized/
        year_csv = pd.read_csv(f'{directory}/csv_lemmatized/speeches_{year}.csv', sep=';', header=0)
        # format a dataframe to store the results
        # > columns: speaker_id, year, word, word_n (how many times the word appears)
        df_speaker_words_year = pd.DataFrame(columns=['speaker_id', 'year', 'word', 'word_n']).astype({'speaker_id': str, 'year': int, 'word': str, 'word_n': int})    
        # extract each word per speaker per df (=year)
        for speaker in year_csv['speaker_id'].loc[(year_csv['speaker_id'].notna()==True)&(year_csv['speaker_id'].str.len()>0)].unique():
            # extract only rows for the speaker in iteration, and only if a lemmatized speech exists, and only if the lemmatized speech is longer than 0 chars
            df_filtered = year_csv.loc[(year_csv['speaker_id']==speaker)&(year_csv['content_lemmatized'].notna()==True)&(year_csv['content_lemmatized'].str.len()>0)]
            # extract each lemmatized speech; compile them into a single string
            speaker_all_speeches = ' '.join(df_filtered['content_lemmatized'])
            # string cleaning: clean special characters from the string
            speaker_all_speeches = ' '.join([clean_string(word) for word in speaker_all_speeches.split(' ')])
            # get the count of each word in the string, return a dict
            speaker_words_dict = count_word_freqs_in_string(speaker_all_speeches)
            # delete empty '' keys (words) from the dict, if such have made it there. these are failed lemmatizations
            try:
                del speaker_words_dict['']
            except KeyError:
                pass
            # add the speaker's subset (speaker_id, year, word, word_n) in the combination dataframe
            # > from the speaker_words_dict, and year, and speaker
            # > create another "temporary" dataframe for this, concatenate this to the df_speaker_words_year df
            concat_df = pd.DataFrame.from_dict(data=speaker_words_dict, orient='index', columns=['word_n'])
            concat_df['speaker_id'], concat_df['year'], concat_df['word'] = speaker, year, concat_df.index
            concat_df.reset_index(drop=True, inplace=True)
            concat_df = concat_df[['speaker_id','year','word','word_n']]  
            df_speaker_words_year = pd.concat([df_speaker_words_year, concat_df], axis=0, ignore_index=True)
        # store the data in a csv (savepoint!)
        # > directory and file name template: directory/csv_analysis/speaker_words_YYYY.csv
        df_speaker_words_year.to_csv(f'{directory}/csv_analysis/{save_file_name}', sep=';', header=True, index=False, encoding='utf-8')
        # time for the next year
    year = year+1

In [98]:
# data for words per speaker for years 2015-2025 has now been created
# next: pick the speakers who appear in all datasets
# > speakers who are present in all datasets 2015-2025
df_speaker_words_comp = pd.DataFrame(columns=['speaker_id', 'year', 'word', 'word_n', 'word_norm']).astype({'speaker_id':str, 'year':int, 'word':str, 'word_n':int, 'word_norm':float})
for f in [file for file in os.listdir(f'{directory}/csv_analysis/') if re.search(r'speaker_words_\d+\.csv', file)]:
    df = pd.read_csv(f'{directory}/csv_analysis/{f}', sep=';', header=0, encoding='utf-8', dtype={'speaker_id': str, 'year': int, 'word': str, 'word_n': int})
    # calculate the normalised frequency of word per speaker per year: min-max normalisation
    # store in column: word_norm
    # running this in one apply with subqueries into the dataframe causes setting with copy warnings and takes way too long due to relatively large row counts
    # > therefore let's do this step by step with fewer repetitive queries
    df['word_norm'] = None
    # loop through each speaker
    for speaker in df['speaker_id'].unique():
        # minimum and maximum word counts per speaker
        w_min, w_max = df['word_n'].loc[df['speaker_id']==speaker].min(), df['word_n'].loc[df['speaker_id']==speaker].max()
        # calculate the normalised frequency per each word of the speaker
        df.loc[df['speaker_id']==speaker, 'word_norm'] = df.apply(lambda x: (x['word_n']-w_min)/(w_max-w_min), axis=1)
    df_speaker_words_comp = pd.concat([df_speaker_words_comp, df], axis=0, ignore_index=True)

# save as csv
file_path_write = f'{directory}/csv_analysis/speaker_words_comp.csv'
try:
    df_speaker_words_comp.to_csv(file_path_write, sep=';', header=True, index=False, encoding='utf-8')
except FileExistsError:
    os.remove(file_path_write)
    df_speaker_words_comp.to_csv(file_path_write, sep=';', header=True, index=False, encoding='utf-8')

In [212]:
df_speaker_words_comp = pd.read_csv(f'{directory}/csv_analysis/speaker_words_comp.csv',sep=';',header=0,encoding='utf-8',dtype={'speaker_id': str, 'year': int, 'word': str, 'word_n': int, 'word_norm':float})
# keep only records where the speaker_id is a valid identifier (string of numbers)
df_speaker_words_comp = df_speaker_words_comp.loc[df_speaker_words_comp['speaker_id'].str.contains(pat=r'\d+')==True]

In [189]:
def linear_extrapolation(y: list, x: list, n=1) -> list:
    """Linear extrapolation based on last two x and y observations. Returns the extrapolated value of y for a given x based on y1x1 and y2x2.
    Parameter x will be standardised to a running sequence of numbers so extrapolations works on a linear scale.
    Args:
        y (list): list of values
        x (list): list of values
        n (int, optional): for how many times shall extrapolation be done. Defaults to 1. Larger values will start extrapolating on extrapolated values.

    Returns:
        list: extrapolated value(s) of y. The length of the list will be n.
    """
    # x and y must be arrays of same length
    # CHANGE THIS TO ASSERT
    if len(y) != len(x):
        print('array length mismatch')
        return None
    else:
        # format helper parameters to not modify lists outside the function
        xx = [*[i for i in x]]
        yy = [*[i for i in y]]
        return_list = [] # format list to be returned
        # loop n times -> return list of n length with n extrapolations
        # note: extrapolating on extrapolations if n>1
        while n >= 1:
            # format xx: it shall take a running sequence of numbers as its values
            xx = [i for i in range(len(x))]
            m = (yy[-1] - yy[-2]) / (xx[-1] - xx[-2])
            y_v = yy[-2] + m * ((xx[-1] + 1) - xx[-2])
            xx.append([xx[-1]+1])
            yy.append(y_v)
            return_list.append(y_v)
            n = n-1
        return return_list

In [None]:
df_speaker_words_comp['word_norm_extrap'] = None
years = sorted(df_speaker_words_comp['year'].unique().tolist())
years_antegpt = sorted([y for y in years if y <= CHATGPT_RELEASE_YEAR])
# compute extrapolated values of word frequencies only for this range of years
loop_years = [2023, 2024, 2025]
for year in loop_years:
    # a list to pass into linear extrapolation as x: two previous years upon which to extrapolate
    years_extrap = [year-2, year-1]
    speakers_year = df_speaker_words_comp['speaker_id'].loc[df_speaker_words_comp['year']==year].unique().tolist()
    for speaker in speakers_year:
        # if the speaker is not in the sets of the previous two years -> skip processing the speaker's words
        # linear extrapolation would return None for the speaker and processing takes a lot of time
        if (speaker not in df_speaker_words_comp['speaker_id'].loc[df_speaker_words_comp['year']==year-1].unique()) and (speaker not in df_speaker_words_comp['speaker_id'].loc[df_speaker_words_comp['year']==year-2].unique()):
            pass
        else:
            speaker_words_year = df_speaker_words_comp['word'].loc[(df_speaker_words_comp['year']==year)&(df_speaker_words_comp['speaker_id']==speaker)].unique().tolist()
            #speaker_words_year = ['puhuja', 'ei', 'tämä']
            # compute linear extrapolation of word
            for word in speaker_words_year:
                # list the normalised values of word frequencies from previous TWO years for the extrapolation
                # > create a list for this
                # > ARBITRARY TWO (2) YEARS: the linear extrapolation function handles only 2 years and this is done to limit the amount of looping
                # > change this if 2 is not enough, but for the function to work 2 _is_ enough
                list_word_norms = []
                for y in years_extrap:
                    list_word_norms.append(df_speaker_words_comp['word_norm'].loc[(df_speaker_words_comp['year']==y)&(df_speaker_words_comp['speaker_id']==speaker)&(df_speaker_words_comp['word']==word)].values.astype(float))
                print(f'year: {year} - speaker: {speaker} - word: {word} - list_word_norms: {list_word_norms}')
                word_norm_extrap = linear_extrapolation(y=list_word_norms, x=years_extrap, n=1)[0]
                print(f'word_norm_extrap: {word_norm_extrap} - len(word_norm_extrap): {len(word_norm_extrap)}')
                # add the extrapolated value for the word only if such value has been created
                if len(word_norm_extrap)==1:
                    df_speaker_words_comp.loc[(df_speaker_words_comp['year']==year)&(df_speaker_words_comp['speaker_id']==speaker)&(df_speaker_words_comp['word']==word), 'word_norm_extrap'] = word_norm_extrap

In [258]:
df_speaker_words_comp = pd.read_csv(f'{directory}/csv_analysis/speaker_words_comp.csv',sep=';',header=0,encoding='utf-8',dtype={'speaker_id': str, 'year': int, 'word': str, 'word_n': int, 'word_norm':float})
# keep only records where the speaker_id is a valid identifier (string of numbers)
df_speaker_words_comp = df_speaker_words_comp.loc[df_speaker_words_comp['speaker_id'].str.contains(pat=r'\d+')==True]

In [None]:
# compute extrapolated values of word frequencies only for this range of years
loop_years = [2023, 2024, 2025]
for year in loop_years:
    # name for a new column: extrapolated word frequency
    # > a new column will be created for each looped year
    col_word_norm_extrap = f'word_norm_extrap_{year}'
    #df_speaker_words_comp[col_word_norm_extrap] = None
    # merging the dataframe will speed this up instead of looping each row
    # > create a new df to quickly compute the linear extrapolation for word frequency: df_merge
    # > df_merge is created by joining df_speaker_words_comp for year, said df year-1 and year-2 on speaker_id and word
    # > INNER JOIN filters rows where there are no normalised frequencies in the previous years -> extrapolation for these words would fail
    # > LEFT JOIN keeps all recrods - this can be done to keep the records 
    # > finally the extrapolated normalised frequency is updated into df_speaker_words_comp in the update
    df_merge = df_speaker_words_comp[['speaker_id','year','word','word_norm']].loc[df_speaker_words_comp['year']==year].merge(
                   df_speaker_words_comp[['speaker_id','year','word','word_norm']].loc[df_speaker_words_comp['year']==year-1],
                   how='left', on=['speaker_id', 'word'], suffixes=(None,'_t_1'))
    df_merge = df_merge.merge(df_speaker_words_comp[['speaker_id','year','word','word_norm']].loc[df_speaker_words_comp['year']==year-2],
        how='left', on=['speaker_id', 'word'], suffixes=(None,'_t_2'))
    df_merge[col_word_norm_extrap] = df_merge.apply(lambda x: linear_extrapolation(y=[x['word_norm_t_2'], x['word_norm_t_1']],x=[year-2, year-1],n=1)[0], axis=1)
    df_speaker_words_comp = df_speaker_words_comp.merge(df_merge[['speaker_id','word','year',col_word_norm_extrap]], how='left', on=['speaker_id','word','year'])

Test if the getting extrapolations for years 2023, 2024 and 2025 worked.

```
df_speaker_words_comp.loc[(df_speaker_words_comp['speaker_id']=='1126')&(df_speaker_words_comp['word']=='viikko')]
```

Next collapse word_norm_extrap_[2023, 2024, 2025] into a single column, word_norm_extrap, since year is already a column in the df.

In [265]:
df_speaker_words_comp.loc[df_speaker_words_comp['year']==2023, 'word_norm_extrap'] = df_speaker_words_comp['word_norm_extrap_2023'] 
df_speaker_words_comp.loc[df_speaker_words_comp['year']==2024, 'word_norm_extrap'] = df_speaker_words_comp['word_norm_extrap_2024'] 
df_speaker_words_comp.loc[df_speaker_words_comp['year']==2025, 'word_norm_extrap'] = df_speaker_words_comp['word_norm_extrap_2025']
# keep only columns speaker_id, year, word, word_n, word_norm, word_norm_extrap
df_speaker_words_comp = df_speaker_words_comp[['speaker_id','year','word','word_n','word_norm','word_norm_extrap']]

Validate that collapsing the columns worked.

<code>df_speaker_words_comp.loc[(df_speaker_words_comp['speaker_id']=='1126')&(df_speaker_words_comp['word']=='viikko')]</code>

In [266]:
# compute differences and ratios between actual normalised frequencies and the extrapolated values:
# > diffs: actual value minus extrapolated value
# > ratios: actual value divided by extrapolated value
# >> replace extrapolated value with 1 to avoid divide by zero errors
df_speaker_words_comp['word_diffs'] = df_speaker_words_comp.apply(lambda x: x['word_norm'] - x['word_norm_extrap'], axis=1)
df_speaker_words_comp['word_ratios'] = df_speaker_words_comp.apply(lambda x: x['word_norm'] / (1 if (x['word_norm_extrap'] is None or x['word_norm_extrap']==np.float64(0)) else x['word_norm_extrap']), axis=1)
df_speaker_words_comp.loc[(df_speaker_words_comp['speaker_id']=='1126')&(df_speaker_words_comp['word']=='viikko')]

Unnamed: 0,speaker_id,year,word,word_n,word_norm,word_norm_extrap,word_diffs,word_ratios
166269,1126,2015,viikko,1,0.0,,,
320467,1126,2016,viikko,2,0.00093,,,
920249,1126,2017,viikko,1,0.0,,,
1500596,1126,2018,viikko,2,0.001036,,,
2001247,1126,2020,viikko,4,0.003916,,,
2606891,1126,2021,viikko,5,0.004785,,,
3416948,1126,2022,viikko,1,0.0,,,
3668046,1126,2023,viikko,3,0.003984,-0.004785,0.008769,-0.832669
3822633,1126,2024,viikko,14,0.009738,0.007968,0.00177,1.222097
4295542,1126,2025,viikko,21,0.016103,0.015492,0.000611,1.039471


In [268]:
df_speaker_words_comp['speaker_id'].loc[(df_speaker_words_comp['year']==2024)&(df_speaker_words_comp['word_ratios']>5)].unique()

array(['1147', '947', '1392', '1126', '1430', '784', '1385', '1310',
       '1157', '464', '1400', '1325', '1380', '1328', '1149', '1422',
       '1334', '1424', '1106', '1387', '1306', '1384', '1388', '1089',
       '797', '1429', '1437', '778', '1454', '1417', '1383', '967',
       '1409', '1133', '301', '1418', '1403', '1137', '1408', '1265',
       '1301', '1411', '1129', '1449', '1141', '511', '1135', '1450',
       '1331', '583', '1298', '1312', '1300', '1302', '612', '1326',
       '1131', '1382', '1094', '1452', '1096', '1455', '1282', '1299',
       '971', '1451', '1349', '351', '1396', '953', '358', '1445', '1345',
       '1401', '1426', '1097', '538', '1099', '1468', '1340', '1402',
       '1440', '1391', '1447', '1483', '1219', '499', '1341', '1398',
       '1323', '1469', '1443', '1435', '1144', '1415', '960', '1338',
       '963', '1390', '1100', '1439', '1410', '1134', '1093', '772',
       '1327', '1308', '1433', '1318', '794', '1428', '1379', '970',
       '1314', '144

In [None]:
years = sorted(df_speaker_words_comp['year'].unique().tolist())
years_antegpt = sorted([y for y in years if y <= CHATGPT_RELEASE_YEAR])
#df_speaker_words_comp['word_norm_extrap'] = None
# compute extrapolated values of word frequencies only for this range of years
loop_years = [2023, 2024, 2025]
for year in loop_years:
    # a list to pass into linear extrapolation as x: two previous years upon which to extrapolate
    years_extrap = [year-2, year-1]
    # name for a new column: extrapolated word frequency
    # > a new column will be created for each looped year
    #col_word_norm_extrap = f'word_norm_extrap_{year}'
    #df_speaker_words_comp[col_word_norm_extrap] = None
    # merging the dataframe will speed this up instead of looping each row
    # > create a new df to quickly compute the linear extrapolation for word frequency: df_merge
    # > df_merge is created by joining df_speaker_words_comp for year, said df year-1 and year-2 on speaker_id and word
    # > INNER JOIN filters rows where there are no normalised frequencies in the previous years -> extrapolation for these words would fail
    # > LEFT JOIN keeps all recrods - this can be done to keep the records 
    # > finally the extrapolated normalised frequency is updated into df_speaker_words_comp in the update
    df_merge = df_speaker_words_comp[['speaker_id','year','word','word_norm']].merge(
                    df_speaker_words_comp[['speaker_id','year','word','word_norm']].loc[df_speaker_words_comp['year']==year-1],
                    how='left', on=['speaker_id', 'word'], suffixes=(None,'_t_1'))
#   df_merge = df_speaker_words_comp[['speaker_id','year','word','word_norm']].loc[df_speaker_words_comp['year']==year].merge(
#                   df_speaker_words_comp[['speaker_id','year','word','word_norm']].loc[df_speaker_words_comp['year']==year-1],
#                   how='left', on=['speaker_id', 'word'], suffixes=(None,'_t_1'))
    df_merge = df_merge.merge(df_speaker_words_comp[['speaker_id','year','word','word_norm']].loc[df_speaker_words_comp['year']==year-2],
                    how='left', on=['speaker_id', 'word'], suffixes=(None,'_t_2'))
    df_merge['word_norm_extrap'] = df_merge.apply(lambda x: linear_extrapolation(y=[x['word_norm_t_2'], x['word_norm_t_1']],x=years_extrap,n=1)[0], axis=1)
    df_speaker_words_comp = df_speaker_words_comp.merge(df_merge[['speaker_id','word','year','word_norm_extrap']], how='left', on=['speaker_id','word','year'])
    #df_speaker_words_comp[['speaker_id','word','year','word_norm_extrap']].update(df_merge[['speaker_id','word','year','word_norm_extrap']])
    #df_speaker_words_comp.loc[df_speaker_words_comp['year']==year].update(df_merge[['speaker_id','word','year','word_norm_extrap']])
    #df_speaker_words_comp.mask(df_speaker_words_comp[['speaker_id','word','year']].isin(df_merge[['speaker_id','word','year']]), df_merge['word_norm_extrap'], inplace=True, axis=1)
    #df_speaker_words_comp.update(df_merge)

In [None]:
years = sorted(df_speaker_words_comp['year'].unique().tolist())
years_antegpt = sorted([y for y in years if y <= CHATGPT_RELEASE_YEAR])
df_speaker_words_comp['word_norm_extrap'] = None
# compute extrapolated values of word frequencies only for this range of years
loop_years = [2023, 2024, 2025]
for year in loop_years:
    # a list to pass into linear extrapolation as x: two previous years upon which to extrapolate
    years_extrap = [year-2, year-1]
    # name for a new column: extrapolated word frequency
    # > a new column will be created for each looped year
    #col_word_norm_extrap = f'word_norm_extrap_{year}'
    #df_speaker_words_comp[col_word_norm_extrap] = None
    # merging the dataframe will speed this up instead of looping each row
    # > create a new df to quickly compute the linear extrapolation for word frequency: df_merge
    # > df_merge is created by joining df_speaker_words_comp for year, said df year-1 and year-2 on speaker_id and word
    # > INNER JOIN filters rows where there are no normalised frequencies in the previous years -> extrapolation for these words would fail
    # > LEFT JOIN keeps all recrods - this can be done to keep the records 
    # > finally the extrapolated normalised frequency is updated into df_speaker_words_comp in the update
    df_merge = df_speaker_words_comp[['speaker_id','year','word','word_norm']].merge(
                    df_speaker_words_comp[['speaker_id','year','word','word_norm']].loc[df_speaker_words_comp['year']==year-1],
                    how='left', on=['speaker_id', 'word'], suffixes=(None,'_t_1'))
#   df_merge = df_speaker_words_comp[['speaker_id','year','word','word_norm']].loc[df_speaker_words_comp['year']==year].merge(
#                   df_speaker_words_comp[['speaker_id','year','word','word_norm']].loc[df_speaker_words_comp['year']==year-1],
#                   how='left', on=['speaker_id', 'word'], suffixes=(None,'_t_1'))
    df_merge = df_merge.merge(df_speaker_words_comp[['speaker_id','year','word','word_norm']].loc[df_speaker_words_comp['year']==year-2],
                    how='left', on=['speaker_id', 'word'], suffixes=(None,'_t_2'))
    df_merge['word_norm_extrap'] = df_merge.apply(lambda x: linear_extrapolation(y=[x['word_norm_t_2'], x['word_norm_t_1']],x=years_extrap,n=1)[0], axis=1)
    #df_speaker_words_comp = df_speaker_words_comp.merge(df_merge[['speaker_id','word','year','word_norm_extrap']], how='left', on=['speaker_id','word','year'])
    #df_speaker_words_comp[['speaker_id','word','year','word_norm_extrap']].update(df_merge[['speaker_id','word','year','word_norm_extrap']])
    #df_speaker_words_comp.loc[df_speaker_words_comp['year']==year].update(df_merge[['speaker_id','word','year','word_norm_extrap']])
    #df_speaker_words_comp.mask(df_speaker_words_comp[['speaker_id','word','year']].isin(df_merge[['speaker_id','word','year']]), df_merge['word_norm_extrap'], inplace=True, axis=1)
    df_speaker_words_comp.update(df_merge)

In [None]:
# '1126'
#print(speaker_words_year)
#print(list_word_norms)
#list_word_norms = []
#.append(df_speaker_words_comp['word_norm'].loc[(df_speaker_words_comp['year']==y)&(df_speaker_words_comp['speaker_id']==speaker)&(df_speaker_words_comp['word']==word)].values)
#print(list_word_norms)
#print(year)
#print(y)
#print(speaker)
#print(word)
#rint(list_word_norms, years_extrap)
#print(word_norm_extrap)
#df_speaker_words_comp['word_norm'].loc[(df_speaker_words_comp['year']==2023)&(df_speaker_words_comp['speaker_id']==speaker)&(df_speaker_words_comp['word']==word)]
#df_speaker_words_comp['word'].loc[(df_speaker_words_comp['year']==2023)&(df_speaker_words_comp['speaker_id']=='1503')&(df_speaker_words_comp['word']=='ja')]
#df_speaker_words_comp['speaker_id'].loc[df_speaker_words_comp['year']==2023]
#test_list = []
#test_list.append(df_speaker_words_comp['word_norm'].loc[(df_speaker_words_comp['year']==2023)&(df_speaker_words_comp['speaker_id']==speaker)&(df_speaker_words_comp['word']==word)].values)
#print(test_list)
#df_speaker_words_comp['word_norm'].loc[(df_speaker_words_comp['year']==2023)&(df_speaker_words_comp['speaker_id']==speaker)&(df_speaker_words_comp['word']==word)].values[0].astype(float)
#df_speaker_words_comp['year'].unique()
#re.search(pattern=r'\d+', string=df_speaker_words_comp['speaker_id'].unique())
#df_speaker_words_comp['speaker_id'].loc[df_speaker_words_comp['speaker_id'].str.contains(pat=r'\d+')==False].unique()
#df_speaker_words_comp.loc[df_speaker_words_comp['word_norm_extrap'].notna()==True]
# 1096
df_merge = df_speaker_words_comp[['speaker_id','year','word','word_norm']].loc[(df_speaker_words_comp['year']==2023)&(df_speaker_words_comp['speaker_id']=='1096')].merge(
         df_speaker_words_comp[['speaker_id','year','word','word_norm']].loc[(df_speaker_words_comp['year']==2022)&(df_speaker_words_comp['speaker_id']=='1096')],
         how='inner', on=['speaker_id', 'word'], suffixes=(None,'_t_1'))
df_merge = df_merge.merge(df_speaker_words_comp[['speaker_id','year','word','word_norm']].loc[(df_speaker_words_comp['year']==2021)&(df_speaker_words_comp['speaker_id']=='1096')],
         how='inner', on=['speaker_id', 'word'], suffixes=(None,'_t_2'))
df_merge['word_norm_extrap'] = df_merge.apply(lambda x: linear_extrapolation(y=[x['word_norm_t_2'], x['word_norm_t_1']],x=[2021,2022],n=1)[0], axis=1)
df_speaker_words_comp[['speaker_id','year','word','word_norm_extrap']].loc[(df_speaker_words_comp['year']==2023)&(df_speaker_words_comp['speaker_id']=='1096')].update(df_merge[['speaker_id','year','word','word_norm_extrap']])


In [None]:
#df_speaker_words_comp.loc[df_speaker_words_comp['word_norm_extrap'].notna()==True]
#df_speaker_words_comp['word_norm'].loc[(df_speaker_words_comp['year']==y)&(df_speaker_words_comp['speaker_id']==speaker)&(df_speaker_words_comp['word']==word)]
#df_speaker_words_comp['word_norm'].loc[(df_speaker_words_comp['year']==y)&(df_speaker_words_comp['speaker_id']==speaker)]
#df_speaker_words_comp['word_norm'].loc[(df_speaker_words_comp['year']==y)]
#df_speaker_words_comp['word_norm'].loc[(df_speaker_words_comp['speaker_id']==speaker)]

Unnamed: 0,speaker_id,year,word,word_n,word_norm,word_norm_extrap


In [38]:
df_speaker_words_comp= df_speaker_words_comp[df_speaker_words_comp['speaker_id'].isin(df_speaker_words_comp['speaker_id'].loc[df_speaker_words_comp['year']==2015])&
   df_speaker_words_comp['speaker_id'].isin(df_speaker_words_comp['speaker_id'].loc[df_speaker_words_comp['year']==2016])&
   df_speaker_words_comp['speaker_id'].isin(df_speaker_words_comp['speaker_id'].loc[df_speaker_words_comp['year']==2017])&
   df_speaker_words_comp['speaker_id'].isin(df_speaker_words_comp['speaker_id'].loc[df_speaker_words_comp['year']==2018])&
   df_speaker_words_comp['speaker_id'].isin(df_speaker_words_comp['speaker_id'].loc[df_speaker_words_comp['year']==2019])&
   df_speaker_words_comp['speaker_id'].isin(df_speaker_words_comp['speaker_id'].loc[df_speaker_words_comp['year']==2020])&
   df_speaker_words_comp['speaker_id'].isin(df_speaker_words_comp['speaker_id'].loc[df_speaker_words_comp['year']==2021])&
   df_speaker_words_comp['speaker_id'].isin(df_speaker_words_comp['speaker_id'].loc[df_speaker_words_comp['year']==2022])&
   df_speaker_words_comp['speaker_id'].isin(df_speaker_words_comp['speaker_id'].loc[df_speaker_words_comp['year']==2023])&
   df_speaker_words_comp['speaker_id'].isin(df_speaker_words_comp['speaker_id'].loc[df_speaker_words_comp['year']==2024])&
   df_speaker_words_comp['speaker_id'].isin(df_speaker_words_comp['speaker_id'].loc[df_speaker_words_comp['year']==2025])]

In [None]:
# calculate the normalised frequency of word per speaker per year: min-max normalisation
# store in column: word_norm
df_speaker_words_comp['word_norm'] = None
for i, d in df_speaker_words_comp.iterrows():
    w_min = df_speaker_words_comp['word_n'].loc[(df_speaker_words_comp['speaker_id']==d.speaker_id)&(df_speaker_words_comp['year']==d.year)].min()
    w_max = df_speaker_words_comp['word_n'].loc[(df_speaker_words_comp['speaker_id']==d.speaker_id)&(df_speaker_words_comp['year']==d.year)].max()
    # (x[col]-col_min)/(col_max-col_min)
    d.word_norm = (d.word_n-w_min)/(w_max-w_min)

In [None]:
df_speaker_words_comp['word_n'].loc[(df_speaker_words_comp['speaker_id']=='1096')&(df_speaker_words_comp['year']==2015)].min()

np.int64(635)

In [None]:
def get_normalised(speaker_id: str, year: int, word_n: int) -> float:
    w_min = df_speaker_words_comp['word_n'].loc[(df_speaker_words_comp['speaker_id']==speaker_id)&(df_speaker_words_comp['year']==year)].min()
    w_max = df_speaker_words_comp['word_n'].loc[(df_speaker_words_comp['speaker_id']==speaker_id)&(df_speaker_words_comp['year']==year)].max()
    return ((word_n-w_min)/(w_max-w_min))

df_speaker_words_comp['word_norm'] = None
df_speaker_words_comp['word_norm'] = df_speaker_words_comp.apply(lambda x: get_normalised(x['speaker_id'], x['year'], x['word_n']), axis=1)

In [59]:
test_df = df.loc[df['speaker_id']=='127']

In [69]:
test_df['word_norm'] = test_df.apply(lambda x: get_normalised(test_df, x['speaker_id'], x['year'], x['word_n']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['word_norm'] = test_df.apply(lambda x: get_normalised(test_df, x['speaker_id'], x['year'], x['word_n']), axis=1)


In [66]:
test_df.loc[test_df['word_norm']==1]

Unnamed: 0,speaker_id,year,word,word_n,word_norm
30,127,2015,olla,207,1.0
