In [60]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import numpy as np
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt

In [61]:
### Define column headers for csv export
columns_21 = ['Event', 'School', 'TEA', 'City', 'Directors', 'Conference', 'Classification', 'Year', 'ID', 
    'Stage Judge 1', 'Stage Judge 2', 'Stage Judge 3', 'Stage Final', 
    'SR Judge 1', 'SR Judge 2', 'SR Judge 3', 'SR Final', 'Award', 
    'Selection 1', 'Selection 2', 'Selection 3', 'Date', 'Region', 'cj1', 'cj2', 'cj3', 'srj1', 'srj2', 'srj3']
columns_22 = ['Event', 'School', 'TEA', 'City', 'Directors', 'Conference', 'Classification', 'Year', 'ID', 
    'Stage Judge 1', 'Stage Judge 2', 'Stage Judge 3', 'Stage Final', 
    'SR Judge 1', 'SR Judge 2', 'SR Judge 3', 'SR Final', 'Award', 
    'Selection 1', 'Selection 2', 'Selection 3', 'Date', 'Region', 'cj1', 'cj2', 'cj3', 'srj1', 'srj2', 'srj3', 'oops']

## Group the CSVs together, and fix rows with Accompanist error

In [62]:
# list of merged files returned
files = glob.glob("full_run/*.csv")

# joining files with concat and read_csv
df = pd.concat(map(pd.read_csv, files), ignore_index=True)
df = pd.DataFrame(df)

# drop unnamed column
df.drop(columns=['Unnamed: 0'], inplace=True)

# name the columns
df.columns = columns_22

In [63]:
# select rows with acc column 
oops_df = df[df['oops'].notnull()]

# delete the acc columns and fix column names
oops_fix_df = oops_df.drop(columns=['Conference'])
oops_fix_df.columns = columns_21

# drop acc rows from df
df = df.loc[df['oops'].isnull() == True]

# add fixed df to df
df = pd.concat([df, oops_fix_df], ignore_index=True)

# drop oops column
df.drop(columns=['oops'], inplace=True)

# drop rows where Event contains '9'
df = df[df['Event'].str.contains('9') == False]

df.describe()

Unnamed: 0,Event,School,TEA,City,Directors,Conference,Classification,Year,ID,Stage Judge 1,...,Selection 2,Selection 3,Date,Region,cj1,cj2,cj3,srj1,srj2,srj3
count,60803,60803,60803,60803,60803,60803,60803,60803,60803,60803,...,60803,60803,60803,60803,60803,60803,60803,60803,60803,60625
unique,1,6236,2373,1498,24456,28,39,210,60631,13,...,8975,11894,499,155,1116,1219,1196,1230,1344,1203
top,100-Concert Band,Allen High School,TEA:,San Antonio,Joe Martinez,CC,Varsity,2019,1,1,...,Colliding Visions (Balmages/ ),"Moscow, 1941 (Balmages)",DATE of EVENT 04/17/2018,Region: 8,1. Keith Bearden,2. Cindy Lansford,3. Randy Vaughn,1. Phil Anthony,2. Tye Ann Payne,3. Rick Yancey
freq,60803,112,56877,2882,29,14895,31202,3274,78,25874,...,422,446,714,3135,709,463,843,448,341,736


In [64]:
# select rows where selection 3 is null
df_clean = df
df_clean = df_clean[df_clean['Selection 3'].isnull() == False]
len(df_clean)

60803

In [65]:
# Trim whitespace, double spaces, commas, and periods from selections
selection_columns = ['Selection 1', 'Selection 2', 'Selection 3']

for i in selection_columns:
    df_clean[i] = df_clean[i].str.strip()
    df_clean[i] = df_clean[i].str.replace('  ', ' ', regex=False)
    df_clean[i] = df_clean[i].str.replace(',', '', regex=False)
    df_clean[i] = df_clean[i].str.replace('.', '', regex=False)

# Trim whitespace from classification column
df_clean['Classification'] = df_clean['Classification'].str.strip()

# Remove composer/arranger information from selections
for column in selection_columns:
    # remove all inside parenthesis
    df_clean[column] = df_clean[column].str.replace('\(.*\)', '', regex=True)
    # trim whitespace
    df_clean[column] = df_clean[column].str.strip()

In [66]:
# drop rows where conference contains 'Acc'
df_clean = df_clean[df_clean['Conference'].str.contains('Acc') == False]


In [67]:
# Fix names of some conferences
df_clean['Conference'] = df_clean['Conference'].replace('2C', 'CC')
df_clean['Conference'] = df_clean['Conference'].replace('4A', 'AAAA')
df_clean['Conference'] = df_clean['Conference'].replace('cc', 'CC')
df_clean['Conference'] = df_clean['Conference'].replace('1C', 'C')

In [68]:
# Make df_clean column integers
df_clean['Year'] = df_clean['Year'].astype(int)

In [69]:
print(df_clean['Conference'].unique())
print(df_clean['Classification'].unique())
print(df_clean['Year'].unique())

['Varsity ' 'CC' 'AAA' 'C' 'CCC' 'AAAA' 'AA' 'A' 'AAAAA' 'Non-Varsity '
 'Sub Non-Varsity ' 'Sub Non-Varsity B' 'BBB' 'Non-Varsity A' 'BB' 'B'
 'Var-Composite ' ' ' 'Non-Varsity C' 'NVar-Composite ' 'Var-Combined '
 'AAAAAA']
['2005' 'Varsity' 'Non-Varsity' 'Non-Varsity A' 'Non-Varsity B' 'Combined'
 'Sub Non-Varsity' 'Sub Non-Varsity C' 'Sub Non-Varsity B'
 'Sub Non-Varsity A' 'Non-Varsity C' 'Non-Varsity F' 'Sub Non-Varsity D'
 'Sub Non-Varsity E' 'Non-Varsity E' '2006' 'Non-Varsity D' '2008'
 'Var-Combined' 'Var-Composite' '' '2009' 'NVar-Composite' '2010' '2011'
 '2012' 'NVar-Combined' '2013' 'Sub Non-Varsity F' '2014' 'Varsity A'
 'Varsity C' 'Varsity B' '2015' 'Sub Non-Varsity G' 'Var-Composite A'
 'Sub Non-Varsity H' 'Sub Non-Varsity I' 'NVar-Composite A']
[ 17246   2005  14077  11197  11109  17900  16404  17069  17328  13344
  15094  13014  17315  17648  18475  18405  17673  14770  13205  13902
  15470  15934  13485  16213  15350  15679  15324  15314  15307  15306
  15299  1528

In [70]:
judging_columns = ['Stage Judge 1', 'Stage Judge 2', 'Stage Judge 3', 'Stage Final', 'SR Judge 1', 
                'SR Judge 2', 'SR Judge 3', 'SR Final']
numbers = ['1', '2', '3', '4', '5']

for n in numbers:
    for j in judging_columns:
        df_clean.loc[df_clean[j] == n, j] = int(n)




In [71]:
for j in judging_columns:
    print(df_clean[j].unique())

[' ' 2 1 3 4 'DNA' 5 'DQ']
[' ' 2 1 4 3 'DNA' 5 'DQ']
[' ' 2 3 1 4 'DNA' 'DQ' 5]
[' ' 2 1 3 4 'DNA' 5 'DQ']
[' ' 1 2 3 4 5 'DNA' 'DQ']
[' ' 1 2 3 4 'DNA' 5 'DQ']
[' ' 1 2 3 4 'DNA' 5 'DQ']
[' ' 1 2 3 4 'DNA' 5 'TD' 'TRC' 'C' 'DQ' 'PLQ' 'TRO' 'RM1' 'RMD' 'RM2'
 'RMC' 'A' '-' 11 'SWA' 'NAN' 'B' 'D' 0]


In [72]:
# Trim whitespace, double spaces, commas, and periods from selections
selection_columns = ['Selection 1', 'Selection 2', 'Selection 3']

for i in selection_columns:
    df_clean[i] = df_clean[i].str.strip()
    df_clean[i] = df_clean[i].str.replace('  ', ' ', regex=False)
    df_clean[i] = df_clean[i].str.replace(',', '', regex=False)
    df_clean[i] = df_clean[i].str.replace('.', '', regex=False)

# Trim whitespace from classification column
df_clean['Classification'] = df_clean['Classification'].str.strip()

# Remove composer/arranger information from selections
for column in selection_columns:
    # remove all inside parenthesis
    df_clean[column] = df_clean[column].str.replace('\(.*\)', '', regex=True)
    # trim whitespace
    df_clean[column] = df_clean[column].str.strip()


In [73]:
df_clean.columns = df_clean.columns.str.strip()

In [74]:
# Drop Event and TEA column
df = df.drop(columns=['Event', 'TEA'])
print(len(df))

60803


In [75]:
# Remove DNA, DQ
for i in judging_columns:
    df_clean = df_clean[df_clean[i] != 'DNA']
    df_clean = df_clean[df_clean[i] != 'DQ']

# Convert blanks to nans
for i in judging_columns:
    df_clean[i] = df_clean[i].replace(['', ' '], np.nan)

print(len(df_clean))

60269


In [76]:
# Average together Stage Judge 1, 2, and 3 into a new column

df_clean['Stage Average'] = (df_clean['Stage Judge 1'] + df_clean['Stage Judge 2'] + df_clean['Stage Judge 3']) / 3

In [77]:
# sort df_clean by year
df_clean.sort_values(by=['Year'], inplace=True)

# drop Event column
df_clean = df_clean.drop(columns=['Event'])

# drop TEA column
df_clean = df_clean.drop(columns=['TEA'])

# drop ID column
df_clean = df_clean.drop(columns=['ID'])




In [78]:
# import all of the pml csv files into a dataframe
files = glob.glob("csv_files/pml/*.csv")
pml_df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
# convert pml_df to csv
pml_df.to_csv("csv_files/pml_output.csv")

In [79]:
df_clean['Date'] = df_clean['Date'].str.replace('DATE of EVENT ', '')
judge_name_columns = ['cj1', 'cj2', 'cj3', 'srj1', 'srj2', 'srj3']
for i in judge_name_columns:
    df_clean[i] = df_clean[i].str.replace('1. ', '', regex=False)
    df_clean[i] = df_clean[i].str.replace('2. ', '', regex=False)
    df_clean[i] = df_clean[i].str.replace('3. ', '', regex=False)
    # all lowercase
    df_clean[i] = df_clean[i].str.lower()

In [80]:
# for each judge name column, remove everything after the first comma
for i in judge_name_columns:
    #df_clean[i] = df_clean[i].str.split(',', expand=True)[0]
    #df_clean[i] = df_clean[i].str.split('-', expand=True)[0]
    # trim whitespace
    df_clean[i] = df_clean[i].str.strip()

In [81]:
all_columns = df_clean.columns
# trim whitespace from all columns
for i in all_columns:
    try:
        df_clean[i] = df_clean[i].str.strip()
    except:
        pass

In [82]:
# drop rows where School = 0
df_clean = df_clean[df_clean['School'] != '0']

In [83]:
# get rows where year is greater than 3000
df_clean_yr_error = df_clean[df_clean['Year'] > 3000]

# drop rows where year is greater than 3000
df_clean = df_clean[df_clean['Year'] < 3000]

df_clean_yr_error['Year'] = df_clean_yr_error['Classification']
df_clean_yr_error['Classification'] = df_clean_yr_error['Conference']

# fill conference column with blanks
df_clean_yr_error['Conference'] = ''


In [84]:
# combine df_clean and df_clean_yr_error
df_clean = pd.concat([df_clean, df_clean_yr_error], ignore_index=True)

In [85]:
print(df_clean['Conference'].unique())
print(df_clean['Classification'].unique())
print(df_clean['Year'].unique())

['AAAAA' 'A' 'AA' 'AAA' 'AAAA' 'C' 'CC' 'CCC' 'BBB' 'BB' 'B' 'AAAAAA' '']
['Non-Varsity' 'Varsity' 'Sub Non-Varsity' 'Non-Varsity A'
 'Sub Non-Varsity B' 'Sub Non-Varsity A' 'Sub Non-Varsity C'
 'Sub Non-Varsity E' 'Sub Non-Varsity D' 'Non-Varsity B' 'Non-Varsity E'
 'Non-Varsity C' 'Combined' 'Non-Varsity F' 'Non-Varsity D'
 'Var-Composite' 'Var-Combined' '' 'NVar-Composite' 'NVar-Combined'
 'Sub Non-Varsity F' 'Varsity C' 'Varsity B' 'Varsity A'
 'Sub Non-Varsity G' 'Var-Composite A' 'Sub Non-Varsity H'
 'Sub Non-Varsity I' 'NVar-Composite A']
[2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
 2019 2020 2021 2022 '2008' '2009' '2010' '2011' '2012' '2013' '2014'
 '2015']


In [86]:
# convert to csv
df_clean.to_csv("csv_files/full_output.csv")