## A query with optional results ##
This allows the query to return a result even if it has certain attributes missing.
This returns all the data we're looking for.

```PREFIX schema: <http://schema.org/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX mo: <http://purl.org/ontology/mo/>

SELECT *
WHERE {
	?event a schema:Event .
  	OPTIONAL {
    	?event schema:startDate ?date .
  	}
    OPTIONAL { ?event rdfs:label ?title . }
  	OPTIONAL {
    	?event mo:genre ?genre .
  		FILTER contains(str(?genre), "http://data.carnegiehall.org/genres/")
  		OPTIONAL { ?genre rdfs:label ?genreLabel . }
  	}
    OPTIONAL {
    	?event schema:subEvent/schema:workPerformed ?workperformed .
    	OPTIONAL { ?workperformed rdfs:label ?work . }
  		OPTIONAL {
    		?workperformed dcterms:creator ?composer .
      	OPTIONAL { ?composer schema:name ?composerName . }
  			OPTIONAL {
    			?composer skos:exactMatch ?skos .
    			FILTER contains(str(?skos), "wiki") 
  			}
  		}
  	}
    # An optional filter to let us select the years for which we query performances:
    # FILTER (year(?date) >= 1970 && year(?date) < 1980)
}

# Run This

In [1]:
import pandas as pd
import plotly.express as px
import os
import pickle

# Also required:
# • A csv with all the event data returned by the SPARQL query
# • A csv with that matches the wikidata IDs for each composer to their nationality, from a previously-ran series of API calls

# Run This

In [54]:
def add_to_event_info(column_name: str, item: str, event_df, base_df, row):
    '''adds a particular attribute to the subdataframe'''

    event_df[column_name] = base_df[base_df['event'] == row['event']][item]
    return event_df

def create_event_data(items_to_add, base_df, row, index):
    pos = index / 373232
    if 0.99 < pos < 0.9901:
        print('Almost done!')
    elif 0.9 < pos < 0.9001:
        print('90%')
    elif 0.8 < pos < 0.8001:
        print('80%')
    elif 0.7 < pos < 0.7001:
        print('70%')
    elif 0.6 < pos < 0.6001:
        print('60%')
    elif 0.5 < pos < 0.5001:
        print('50%')
    elif 0.4 < pos < 0.4001:
        print('40%')
    elif 0.3 < pos < 0.3001:
        print('30%')
    elif 0.2 < pos < 0.2001:
        print('20%')
    elif 0.1 < pos < 0.1001:
        print('10%')
    elif 0.01 < pos < 0.0101:
        print('1% done!')
    elif 0.001 < pos < 0.0011:
        print('0.1% done!')
    event_data = pd.DataFrame()
    for item in items_to_add:
        event_data = add_to_event_info(item, item, event_data, base_df, row)
    return event_data

def cleaner(x, items_to_add):
    print('Step 1: Create event data')
    x['event_data'] = [create_event_data(items_to_add, x, row, index) for index, row in x.iterrows()]
    
    print('Step 2: Get rid of duplicate rows')
    events = []
    for index, row in x.iterrows():
        pos = index / 373232
        if 0.99 < pos < 0.9901:
            print('Almost done!')
        elif 0.9 < pos < 0.9001:
            print('90%')
        elif 0.8 < pos < 0.8001:
            print('80%')
        elif 0.7 < pos < 0.7001:
            print('70%')
        elif 0.6 < pos < 0.6001:
            print('60%')
        elif 0.5 < pos < 0.5001:
            print('50%')
        elif 0.4 < pos < 0.4001:
            print('40%')
        elif 0.3 < pos < 0.3001:
            print('30%')
        elif 0.2 < pos < 0.2001:
            print('20%')
        elif 0.1 < pos < 0.1001:
            print('10%')
        elif 0.01 < pos < 0.0101:
            print('1% done!')
        elif 0.001 < pos < 0.0011:
            print('0.1% done!')
        # get rid of multiple rows for the same event
        if row['event'] in events:
            x = x.drop(index)
        else:
            events.append(row['event'])
    
    print('Step 3: Adding additional columns')
    # creates a year column
    x['year'] = [int(date[:4]) for date in x['date']]
    
    x['genreLabel'] = x['genreLabel'].str.lower()
    
    print('DONE')

    return x

# Run This

In [20]:
def create_event_frequency_list(df, lookup_range, column, specific_value, normalize=False):
    column_key = {'Genre': 'genreLabel', 'Nationality': 'nationalities', 'Work': 'workperformed', 'Composer': 'composer'}
    column = column_key[column]
    if column == 'genreLabel':
        specific_value = specific_value.lower()
    elif column in ('workperformed', 'composer'):
        specific_value = specific_value[specific_value.index('#') + 1:specific_value.index(')')]
    frequency_list = []
    for year in lookup_range:

        # if it has to do with works and not events (composer, nationality, etc)
        if column in df['event_data'][0].columns:

            # create smaller dataframe with events only in that year to simplify
            sub_df = df[df['year'] == year].copy()
            # print(sub_df['event_data'])

            # create a boolean column for whether the specific value can be found in a work performed at that event
            has_value = []

            # iterate through events in specific year
            for index, row in sub_df.iterrows():
                # isolate the entry in the column for the current event
                important_column = sub_df['event_data'][index][column]

                # check if the desired value is present in the column entry
                # for nationalities, need to check if any of the desired nations are present in the column
                if column == 'nationalities':
                    # assume not present
                    any_nationality_present = False
                    # iterate through nations
                    for nation in specific_value:
                        # if a matching nation is found
                        if f"{nation}" in important_column.to_string():
                            # mark that this nationality group is present for this event
                            any_nationality_present = True
                            # stop searching for matches in this event
                            break
                    # add the boolean storing whether this nationality group was present for this event
                    has_value.append(any_nationality_present)
                else:
                    # add the boolean storing whether the desired value was present for this event
                    has_value.append(f"{specific_value}" in important_column.to_string())

            if column == 'nationalities':
                sub_df[specific_value[0]] = has_value
            else:
                sub_df[specific_value] = has_value
            #             sub_df[specific_value] = [specific_value in sub_df['event_data'][index][column].to_list() for index, row in sub_df.iterrows()]

            # create the frequency list
            try:  # if the desired event has occurred in this year, add the number of times it occured
                if column == 'nationalities':
                    frequency_list.append(sub_df.value_counts(specific_value[0], normalize=normalize).to_dict()[True])
                else:
                    frequency_list.append(sub_df.value_counts(specific_value, normalize=normalize).to_dict()[True])
            except KeyError:  # if the desired event has not occurred in this year
                frequency_list.append(0)

        #             # remove the extra column
        #             sub_df.drop(columns=[specific_value])

        # if it has to do with events (genre, work, etc)
        else:
            attribute_counts = df[df['year'] == year].value_counts(column, normalize=normalize)

            # getting the count for the specific value
            try:
                if column == 'nationalities':
                    frequency_list.append(attribute_counts[specific_value[0]])
                else:
                    frequency_list.append(attribute_counts[specific_value])
            except KeyError:
                frequency_list.append(0)

    return frequency_list


def make_bar_chart(df, column, specific_value, lookup_range=(0, 0), normalize=False):
    """make a bar chart of the frequency of "specific_value", which is a value in "column" over "lookup_range" years"""
    # Create a DataFrame for bar chart

    # years is the x-axis
    # this first if statement allows you to make the chart for a subset of the years
    if lookup_range != (0, 0):
        years = []
        for year in range(lookup_range[0], lookup_range[1] + 1):
            years.append(year)

    else:
        years = list(set(df['year'].to_list()))


    # list of frequencies
    frequency = create_event_frequency_list(df, years, column, specific_value, normalize)


    bar_data = {'Years': years,
                'frequency': frequency}
    df_bar = pd.DataFrame(bar_data)

    # The barchart with Plotly Express specifying the source df, the columns to use as x and y axes,
    # labels to use for those axes, and an overall title for the figure

    fig = px.bar(df_bar,
                 x = 'Years', y= 'frequency',
                 labels={'Years': 'Years', 'frequency': f'Performances of {column.title()}: {specific_value}'},
                 title=f'Performances of {column.title()}: {specific_value} by Year',
                )
    # Set width and height in pixels
    fig.update_layout(width=600, height=400)
    fig.show()

pickle_data = pd.read_pickle('testPickle.pkl')
make_bar_chart(pickle_data, 'Nationality', ['Germany', 'Kingdom of Germany'], normalize=True)

# Run this

In [36]:
def add_nationalities(input_df):
    """Combines nationality data csv and carnegie hall data csv together"""
    input_df.insert(6, "nationalities", pd.Series(dtype=str))
    
    names_with_nationalities = pd.read_csv('CarnegieData/nationalities_new.csv')
    
    # Iterate through the dataframe, adding nationalities when possible
    for index in input_df.index:
        nationalities = names_with_nationalities.loc[names_with_nationalities['composer'] == input_df.loc[index, 'composer']]
        nationalities = nationalities['nationalities']
        try:
            nationalities = nationalities.get(nationalities.keys()[0])
        except IndexError:
            nationalities = 'statelessness'
        try:
            input_df.at[index, 'nationalities'] = nationalities
        except KeyError:
            print(input_df.loc[index])
    
    return input_df

In [37]:
df = pd.read_csv("events_list.csv")
# do this for whatever columns you want to add to the event dataframe (nationality, name or work performed, etc)
# they just need to be the exact names
items_to_add = ['workperformed', 'composerName', 'nationalities']

In [0]:
# add comments
df = add_nationalities(df)

In [55]:
# This line of code is the most computationally intensive, and can take many minutes to run.
data = cleaner(df, items_to_add)

Step 1: Create event data
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
0.1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
1% done!
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
20%
20%
20%
20%
20%
20%
20%
20%
20%
20%
20%
20%
20%
20%
20%
20%
20%
20%
20%
20%
20%
20

In [56]:
data.to_pickle('finalPickle.pkl')

In [1]:
# do this with the exact column name and specific value you want to create a graph for. see examples above
pickle_data = pd.read_pickle('finalPickle.pkl')
make_bar_chart(pickle_data, 'Work', '#54128)')

NameError: name 'pd' is not defined

In [None]:
def app(dataframe_source, column, value, items_to_add_param = ('workperformed', 'composerName', 'nationalities')):
    inner_df = pd.read_csv(dataframe_source)
    # do this for whatever columns you want to add to the event dataframe (nationality, name or work performed, etc)
    # they just need to be the exact names

    inner_df = add_nationalities(inner_df)
    # This line of code is the most computationally intensive, and can take many minutes to run.
    inner_data = cleaner(inner_df, list(items_to_add))
    make_bar_chart(inner_data, column, value)
    

## Combining datasets

If SPARQL query limits mean that the necessary data is stored in many disparate csv files, this code will take the path to the folder containing all those csvs (and nothing else!!) and the desired file path of the resultant csv that will concatenate all of the smaller csvs.

In [21]:
# code to combine several downloaded sparql query results

def combine_csvs(folder_path, file_name):
    """
    Combines several csvs into one large one
    :param folder_path: path to the folder containing all the csvs to be contained and NOTHING ELSE
    :param file_name: path to the file that will store the combined csv, including .csv at the end!
    :return: 
    """
    files = os.listdir(folder_path)
    
    df_list = []
    for csv in files:
        file_path = os.path.join(folder_path, csv)
        csv_df = pd.read_csv(file_path)
        df_list.append(csv_df)
    
    big_df = pd.concat(df_list, ignore_index=True)
    big_df.to_csv(file_name)

In [22]:
combine_csvs('CarnegieData/AllEvents', 'events_list.csv')