In [91]:
# import packages, declare constants, get the parent directory
import pandas as pd
import numpy as np
import re
import os
import time
from scipy import stats
#from wordcloud import WordCloud
import matplotlib.pyplot as plt
#from .utils import helpers
import simplemma
import pyvoikko
import decimal
import numpy as np

# Lowercase letters that clean_string() treats as valid word characters:
# a token must contain at least one of them, and characters outside this set
# (other than the dash '-') are the ones its final cleaning step targets.
FINNISH_ALPHABET = 'abcdefghijklmnopqrstuvwxyzåäö'

def get_parent_directory() -> str:
    """Return the absolute path of the directory one level above the working directory.

    The csv directories used by this notebook are addressed relative to this path.

    Returns:
        str: absolute parent path with every backslash replaced by a forward slash
    """
    # resolve "<cwd>/.." into an absolute, normalised path
    parent_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

    # use forward slashes only, so the path can be embedded in f-string paths
    return parent_path.replace('\\', '/')

# Base path under which this notebook reads csv_lemmatized/ and writes csv_analysis/.
directory = get_parent_directory()

DECLARE FUNCTIONS

In [92]:
def clean_string(string: str) -> str:
    """Normalise one token so that only Finnish letters and inner dashes remain.

    The token is stripped and lower-cased, then punctuation, whitespace, digits,
    slashes, equal signs and a leading/trailing dash are removed. As the last
    step every character that is neither in ``FINNISH_ALPHABET`` nor a dash
    (kept for compound words) is dropped.

    Args:
        string: the raw token to clean.

    Returns:
        The cleaned token, or '' when the token contains no letter of
        ``FINNISH_ALPHABET`` or when fewer than two characters are left.

    Raises:
        Exception: any error raised while cleaning (for example
            ``AttributeError`` for a non-string input) is reported with
            ``print`` and then re-raised unchanged.
    """
    try:
        # remove blanks in start and end
        string = string.strip()
        string = string.lower()
        # the string must contain characters
        if not any(c in string for c in FINNISH_ALPHABET):
            string = ''
        # remove tabulations, line breaks etc., also special characters
        remove_these = r'[\+\*!"”?.,…()§\'[\] \t\n\r\f\v]'
        string = re.sub(remove_these, '', string)
        # remove weird parentheses and backwards linebreaks from starts of strings
        string = re.sub(r'^\)\\[a-z]', '', string)
        # remove weird '\[alphabet]' strings at start of strings
        string = re.sub(r'^\\[a-z]', '', string)
        # remove numbers
        string = re.sub(r'[0-9]', '', string)
        # remove dashes '-' at the start and end of string
        string = re.sub(r'^-|-$', '', string)
        # remove individual forward and backward slashes '/', '\'
        string = re.sub(r'[\/\\]', '', string)
        # collapse a double dash '--' into a single dash
        string = string.replace('--', '-')
        # remove the equal sign '='
        string = string.replace('=', '')
        # at the end of the cleaning, keep only alphabet characters and dashes (compound words).
        # This is a plain character filter on purpose: feeding the unwanted characters to
        # re.sub as a pattern would only remove them when they appear as one contiguous
        # sequence, and would misbehave on regex metacharacters such as '|', '$' or '^'.
        string = ''.join(c for c in string if c == '-' or c in FINNISH_ALPHABET)
        # remove blanks in start and end again
        string = string.strip()
        # remove empty if string length < 2
        string = '' if len(string) < 2 else string
        return string
    except Exception:
        # report the offending input, then let the caller see the original error
        print(f'Unexpected error at helpers.clean_string(), string: {string}')
        raise

In [93]:
def count_word_freqs_in_string(string: str):
    """Tally how many times each space-separated word occurs in ``string``.

    Args:
        string: text whose words are separated by single spaces.

    Returns:
        A dictionary mapping each word to its frequency, in first-seen order,
        or None when ``string`` is None or the literal text 'nan'. Consecutive
        spaces produce the empty string '' as a word.
    """
    # missing input (None or the text 'nan') produces no dictionary at all
    if string is None or string == 'nan':
        return None

    counts = {}
    for token in re.split(' ', string):
        counts[token] = counts.get(token, 0) + 1
    return counts

In [54]:
# load the lemmatised speeches of 2015 (semicolon-separated, every column read as text)
# the overall goal is to check if word frequencies per SPEAKER change over the years 2015-2025

df = pd.read_csv(
    f'{directory}/csv_lemmatized/speeches_2015.csv',
    sep=';',
    encoding='utf-8',
    header=0,
    dtype=str,
)

In [None]:
# clean 'speaker_id' column:
# > strip surrounding whitespace from every non-missing id
# > turn the literal text 'nan' into an empty string
# > leave missing values (None / NaN) untouched
# Done column-wise: missing values are detected with notna() rather than an
# identity comparison against np.nan, which only holds when the cell stores
# that exact object, and no row-by-row apply is needed.

speaker_ids = df['speaker_id']
stripped_ids = speaker_ids.astype(str).str.strip()
df['speaker_id'] = stripped_ids.where(speaker_ids.notna(), speaker_ids).replace('nan', '')
# df['speaker_id'].loc[(df['speaker_id'].notna()==True)&(df['speaker_id'].str.len()>0)].unique()

In [139]:
# create the data in loops:
## > loop the csvs for years 2015-2025
## > extract each word per speaker per df (=year)
## > store one row per (speaker_id, year, word, word_n) in dataframe df_speaker_words_year
## > save the dataframe as directory/csv_analysis/speaker_words_YYYY.csv (savepoint!)
# NOTE: a year is skipped when its output file already exists, so files written
# by an earlier version of this cell must be deleted by hand to be regenerated.
# declare the years
first_year, max_year = 2015, 2025
analysis_dir = f'{directory}/csv_analysis'
# make sure the savepoint directory exists before looking into it / writing to it
os.makedirs(analysis_dir, exist_ok=True)
# start looping the csvs
for year in range(first_year, max_year + 1):
    # name for saving the file
    save_file_name = f'speaker_words_{year}.csv'
    save_path = f'{analysis_dir}/{save_file_name}'
    if os.path.isfile(save_path):
        # this year has already been processed
        continue
    # get the csv to match the year from directory: directory/csv_lemmatized/
    # > every column is read as text so the .str accessors below always work
    year_csv = pd.read_csv(f'{directory}/csv_lemmatized/speeches_{year}.csv', sep=';', encoding='utf-8', header=0, dtype=str)
    # rows that have a speaker / a non-empty lemmatized speech
    has_speaker = year_csv['speaker_id'].notna() & (year_csv['speaker_id'].str.len() > 0)
    has_speech = year_csv['content_lemmatized'].notna() & (year_csv['content_lemmatized'].str.len() > 0)
    # collect plain (speaker_id, year, word, word_n) tuples and build the dataframe once
    # at the end; concatenating a dataframe per speaker copies all earlier rows every time
    rows = []
    # extract each word per speaker per df (=year)
    for speaker in year_csv.loc[has_speaker, 'speaker_id'].unique():
        # extract only rows for the speaker in iteration, and only if a non-empty lemmatized speech exists
        df_filtered = year_csv.loc[(year_csv['speaker_id'] == speaker) & has_speech]
        # extract each lemmatized speech; compile them into a single string
        speaker_all_speeches = ' '.join(df_filtered['content_lemmatized'])
        # string cleaning: clean special characters from the string
        speaker_all_speeches = ' '.join(clean_string(word) for word in speaker_all_speeches.split(' '))
        # get the count of each word in the string; the counter returns None for
        # missing input, which is treated as "no words"
        speaker_words_dict = count_word_freqs_in_string(speaker_all_speeches) or {}
        # delete the empty '' key (word), if it has made it there. these are failed lemmatizations
        speaker_words_dict.pop('', None)
        # add the speaker's subset in column order: speaker_id, year, word, word_n
        rows.extend((speaker, year, word, word_n) for word, word_n in speaker_words_dict.items())
    df_speaker_words_year = pd.DataFrame(rows, columns=['speaker_id', 'year', 'word', 'word_n'])
    # store the data in a csv (savepoint!)
    # > directory and file name template: directory/csv_analysis/speaker_words_YYYY.csv
    df_speaker_words_year.to_csv(save_path, sep=';', header=True, index=False, encoding='utf-8')

In [138]:
# Scratch check: tabulate the word counts currently held in `speaker_words_dict`,
# i.e. the dictionary left over from the last speaker processed by the yearly loop.
# The words become the index and their counts the single column 'word_n'.
# NOTE: in the visible notebook this name is only assigned inside that loop, so the
# cell needs the loop to have processed at least one speaker in this kernel session.
a = pd.DataFrame.from_dict(
    data=speaker_words_dict,
    orient='index',
    columns=['word_n'],
)
print(a)

           word_n
arvoisa        43
rouva          15
puhemies       48
haluta         12
aloittaa        2
...           ...
korjaus         1
lindtman        1
vähennys        1
taksiauto       1
hankinta        1

[1340 rows x 1 columns]
