In [None]:
# Checking if connected to the GPU

%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [None]:
# Importing libraries
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
import time
from google.colab import files

# scikit learn packages 
from sklearn.feature_extraction.text import TfidfVectorizer # vectorizer
from sklearn.model_selection import train_test_split # For splitting into test and train data
from sklearn.linear_model import SGDClassifier # SGD classifier - better model for classification
from sklearn.pipeline import Pipeline # Pipeline framework
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, cohen_kappa_score
from sklearn.model_selection import GridSearchCV, cross_validate, StratifiedKFold

In [None]:
# Python Program to Convert seconds
# into hours, minutes and seconds
  
def convert(seconds):
    """Format an elapsed duration in seconds as ``MM:SS`` or ``H:MM:SS``.

    Args:
        seconds: Non-negative duration in seconds (int or float). Fractional
            seconds are truncated by the ``%d`` formatting.

    Returns:
        ``"MM:SS"`` when the duration is shorter than one hour, otherwise
        ``"H:MM:SS"``. Hours are not wrapped at 24, so a 25-hour run is
        reported as ``"25:00:00"`` rather than ``"1:00:00"``.
    """
    # No modulo-24h step: this formats elapsed time, not a time of day, so
    # long runs must keep counting hours instead of starting over at zero.
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)

    if hours == 0:
        return "%02d:%02d" % (minutes, secs)
    return "%d:%02d:%02d" % (hours, minutes, secs)

In [None]:
# Remote location of the raw CSV (GitHub link with ?raw=true).
filepath = "https://github.com/ghelms/data/blob/master/folketinget_2009_2021_raw.csv?raw=true"
# Read the CSV into `df`. Later cells use its "Parti", "Year" and "text" columns.
# NOTE(review): this comment used to say "only a subset", but no row limit
# (e.g. nrows) is passed here, so the whole file is loaded - confirm which is intended.
df = pd.read_csv(filepath)

In [None]:
# Number of rows per distinct "Parti" label in the data as loaded.
df['Parti'].value_counts()

Socialdemokratiet              53025
Venstre                        48875
Dansk Folkeparti               41183
Enhedslisten                   31244
Socialistisk Folkeparti        20085
Det Konservative Folkeparti    15040
Radikale Venstre               13350
Liberal Alliance               12527
Alternativet                    7325
Nye Borgerlige                  3311
Det Radikale Venstre            2239
Uden for partierne               587
Fremskridtspartiet               553
Ny Alliance                      470
Konservative Folkeparti          102
Venstresocialisterne              19
Name: Parti, dtype: int64

In [None]:
# Harmonise party labels, then drop the labels that should not be modelled.

# "Konservative Folkeparti" is a second spelling of "Det Konservative
# Folkeparti"; fold it into the longer form so both count as one class.
# NOTE(review): the label counts shown after this cell still list both
# "Radikale Venstre" and "Det Radikale Venstre" - this looks like the same kind
# of duplicate name; confirm whether it should be merged here as well.
df["Parti"] = df["Parti"].replace({"Konservative Folkeparti": "Det Konservative Folkeparti"})

# Labels to exclude entirely (both have very few rows in the counts shown earlier).
list_of_parties_to_remove = ["Uden for partierne", "Venstresocialisterne"]
df = df.loc[~df["Parti"].isin(list_of_parties_to_remove)]

In [None]:
# Re-count the "Parti" labels to check the effect of the rename and the removals.
df['Parti'].value_counts()

Socialdemokratiet              53025
Venstre                        48875
Dansk Folkeparti               41183
Enhedslisten                   31244
Socialistisk Folkeparti        20085
Det Konservative Folkeparti    15142
Radikale Venstre               13350
Liberal Alliance               12527
Alternativet                    7325
Nye Borgerlige                  3311
Det Radikale Venstre            2239
Fremskridtspartiet               553
Ny Alliance                      470
Name: Parti, dtype: int64

# Implementing the pipeline

In [None]:
# Defining the pipe: TF-IDF features on unigrams and bigrams feeding a linear
# classifier trained with stochastic gradient descent.
pipe = Pipeline([
('vect', TfidfVectorizer(ngram_range=(1,2))),
# NOTE: scikit-learn >= 1.1 calls this loss 'log_loss' and 1.3 removes the
# 'log' spelling; rename it when running on a newer scikit-learn.
('SGD', SGDClassifier(loss='log',
                      penalty='l2',
                      shuffle=True,
                      alpha=1e-2,  # placeholder only: always overridden by the grid below
                      class_weight = 'balanced',
                      # shuffle=True makes training stochastic; without a seed the
                      # selected alpha and all scores change between runs. Same seed
                      # value as the outer StratifiedKFold used later in the notebook.
                      random_state=1234))])

# Grid search over the regularisation strength, scored with Cohen's kappa.
# refit='kappa_score' retrains the best pipeline on all data passed to fit(),
# which is what grid.predict() uses afterwards.
grid = GridSearchCV(estimator = pipe,
                        param_grid = {'SGD__alpha': (1e-3, 1e-4, 1e-5, 1e-6, 1e-7)},
                        cv=20, # number of inner cross-validation folds used to pick alpha
                        scoring={'kappa_score': make_scorer(cohen_kappa_score)},
                        refit='kappa_score',
                        n_jobs=2)

In [None]:
# Nested cross-validation per year: for every year, split that year's speeches
# into stratified outer folds, tune alpha with `grid` on the training part and
# score the refitted model on the held-out part.

# Column order of the results table (one row per year and outer fold)
result_columns = ["Year","k_fold","alpha","accuracy_score",
                  "precision_score","recall_score","kappa_train",
                  "kappa_test", "y_test","y_pred"]

# Rows are collected as plain dicts; DataFrame.append is deprecated (removed in
# pandas 2.0) and copies the whole frame on every call.
fold_records = []
cv_results = pd.DataFrame(columns = result_columns)

# Define number of folds
n_folds = 5

# Defining the folds. The splitter does not depend on the year, and with a
# fixed integer seed every split() call shuffles the same way, so one instance
# can be shared by all years.
skf = StratifiedKFold(n_splits=n_folds,shuffle=True,random_state = 1234)

# Timing the whole run
start_time = time.time()

for year in sorted(df.Year.unique()):
    print("Year: {}".format(year))

    # Timing this year
    start_of_loop = time.time()

    # Subsetting the data pr. year
    session_data = df[df['Year'] == year]

    # Looping through each fold.
    for i, (train_index, test_index) in enumerate(skf.split(session_data, session_data["Parti"].values)):

        # Start time for fold
        start_fold_time = time.time()

        # Dividing into train and test set
        train = session_data.iloc[train_index]
        test = session_data.iloc[test_index]

        # Giving labels
        X_train, y_train = train["text"], train["Parti"]
        X_test, y_test = test["text"], test["Parti"]

        # Fitting the grid
        print("Fitting the grid for year: {} and fold {}".format(year, i+1))
        grid.fit(X_train, y_train)

        # Best inner-CV kappa and the alpha that produced it. Note that
        # best_score_ is the mean score over the inner validation folds, not a
        # score on the training data, despite the "kappa_train" column name.
        kappa_train = grid.best_score_
        alpha = grid.best_params_.get("SGD__alpha")

        # metrics on test set
        print("Predicting and extracting scores")
        y_pred = grid.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        kappa_test = cohen_kappa_score(y_test, y_pred)

        # Adding the values to the list of rows. y_pred is stored as a list,
        # like y_test: a NumPy array cell is written to CSV through its string
        # form, which has no commas and can be abbreviated with "..." when long.
        fold_records.append({"Year": year, "k_fold": i+1, "alpha": alpha, "accuracy_score": accuracy,
                             "precision_score": precision, "recall_score": recall,
                             "kappa_train": kappa_train, "kappa_test": kappa_test,
                             "y_test": y_test.tolist(), "y_pred": y_pred.tolist()})

        # Rebuild the (small) results table after every fold so that partial
        # results are available in `cv_results` if the run is interrupted.
        cv_results = pd.DataFrame(fold_records, columns = result_columns)

        # Printing time stamp
        fold_time = convert(time.time() - start_fold_time)
        print("elapsed time for fold:", fold_time , "\n")

    # Getting the time: this year's duration and the duration since the start
    # of the whole run (start_time, not start_of_loop).
    elapsed_time = time.time()
    print("Run time for year {}: ".format(year), convert(elapsed_time - start_of_loop), ". Total run time: ", convert(elapsed_time - start_time), "\n")

print("FINISH")

# Write the table to disk first; files.download() needs an existing file.
cv_results.to_csv("cv_results.csv", index = False)
files.download("cv_results.csv")

Year: 2009
Fitting the grid for year: 2009 and fold 1




Predicting and extracting scores
elapsed time for fold: 07:00 

Fitting the grid for year: 2009 and fold 2




Predicting and extracting scores
elapsed time for fold: 06:57 

Fitting the grid for year: 2009 and fold 3




Predicting and extracting scores
elapsed time for fold: 07:07 

Fitting the grid for year: 2009 and fold 4




Predicting and extracting scores


  _warn_prf(average, modifier, msg_start, len(result))


elapsed time for fold: 06:58 

Fitting the grid for year: 2009 and fold 5
Predicting and extracting scores
elapsed time for fold: 06:59 

Run time for year 2009:  35:02 . Total run time:  35:02 

Year: 2010
Fitting the grid for year: 2010 and fold 1




Predicting and extracting scores
elapsed time for fold: 25:40 

Fitting the grid for year: 2010 and fold 2




Predicting and extracting scores
elapsed time for fold: 24:26 

Fitting the grid for year: 2010 and fold 3


# Alternative, more manual method: one year per run

In [None]:
# Same nested cross-validation as the full loop, but for a single year, so that
# long runs can be done (and saved) one year at a time.

# Year to run
year = 2010

# Optional cap on the number of rows, for a quick smoke test (e.g. 500).
# None runs the whole year.
# NOTE(review): the original line indexed the frame with [500], which is a
# column lookup and raises KeyError; presumably the first 500 rows were meant.
# Taking only the first rows may leave some parties with too few rows for the
# stratified splits - confirm the intended subset before using the cap.
max_rows = None

# Column order of the results table (one row per outer fold)
result_columns = ["Year","k_fold","alpha","accuracy_score",
                  "precision_score","recall_score","kappa_train",
                  "kappa_test", "y_test","y_pred"]

# Rows are collected as plain dicts; DataFrame.append is deprecated (removed in
# pandas 2.0) and copies the whole frame on every call.
fold_records = []
cv_results = pd.DataFrame(columns = result_columns)

# Define number of folds
n_folds = 5

# Timing the loop
start_of_loop = time.time()

# Subsetting the data pr. year
session_data = df[df['Year'] == year]
if max_rows is not None:
    session_data = session_data.iloc[:max_rows]

# Defining the folds
skf = StratifiedKFold(n_splits=n_folds,shuffle=True,random_state = 1234)

# Looping through each fold.
for i, (train_index, test_index) in enumerate(skf.split(session_data, session_data["Parti"].values)):

    # Start time for fold
    start_fold_time = time.time()

    # Dividing into train and test set
    train = session_data.iloc[train_index]
    test = session_data.iloc[test_index]

    # Giving labels
    X_train, y_train = train["text"], train["Parti"]
    X_test, y_test = test["text"], test["Parti"]

    # Fitting the grid
    print("Fitting the grid for year: {} and fold {}".format(year, i+1))
    grid.fit(X_train, y_train)

    # Best inner-CV kappa and the alpha that produced it. best_score_ is the
    # mean score over the inner validation folds, not a training-set score,
    # despite the "kappa_train" column name.
    kappa_train = grid.best_score_
    alpha = grid.best_params_.get("SGD__alpha")

    # metrics on test set
    print("Predicting and extracting scores")
    y_pred = grid.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    kappa_test = cohen_kappa_score(y_test, y_pred)

    # Adding the values to the list of rows. y_pred is stored as a list, like
    # y_test: a NumPy array cell is written to CSV through its string form,
    # which has no commas and can be abbreviated with "..." when long.
    fold_records.append({"Year": year, "k_fold": i+1, "alpha": alpha, "accuracy_score": accuracy,
                         "precision_score": precision, "recall_score": recall,
                         "kappa_train": kappa_train, "kappa_test": kappa_test,
                         "y_test": y_test.tolist(), "y_pred": y_pred.tolist()})

    # Rebuild the (small) results table after every fold so that partial
    # results are available in `cv_results` if the run is interrupted.
    cv_results = pd.DataFrame(fold_records, columns = result_columns)

    # Printing time stamp
    fold_time = convert(time.time() - start_fold_time)
    print("elapsed time for fold:", fold_time , "\n")

# Getting the time (only one year is run here, so both figures are the same)
elapsed_time = time.time()
print("Run time for year {}: ".format(year), convert(elapsed_time - start_of_loop), ". Total run time: ", convert(elapsed_time - start_of_loop), "\n")

# Keep this year's table under its own name so a later run does not overwrite it.
cv_results_2010 = cv_results
print("FINISH")

# To export, write the table first and then download it:
#cv_results_2010.to_csv("cv_results_2010.csv", index = False)
#files.download("cv_results_2010.csv")