In [93]:
import json
import os

import pandas as pd

In [28]:
# Load the full merged talk table; it is passed to get_talks_from_indices further down.
data = pd.read_csv("../merged_summary_topics.csv")

In [29]:
def get_talk_indices(filename, columns_to_drop=None):
    """Map every topic to the row indices of the talks tagged with it.

    Reads the CSV at ``filename``, removes ``columns_to_drop`` and then, for each
    remaining column, collects the index labels of the rows whose value equals 1.0.
    Topic columns are one-hot encoded, so a 1.0 means the talk carries that topic tag.

    Parameters
    ----------
    filename : str or path-like
        CSV file to read (e.g. merged_summary_topics.csv).
    columns_to_drop : list of str, optional
        Columns removed before the topics are collected. Defaults to
        ``['Month_letter', 'Month', 'Kicker', 'Unnamed: 0']``.

    Returns
    -------
    pandas.DataFrame
        Indexed by topic, with a single column named 'List of Talks' holding the
        list of row indices of the talks for that topic.

    Raises
    ------
    KeyError
        If a column in ``columns_to_drop`` or one of the non-topic columns
        'Year', 'Speaker', 'Title', 'File', 'topic_lists' is not in the file.
    """
    if columns_to_drop is None:
        columns_to_drop = ['Month_letter', 'Month', 'Kicker', 'Unnamed: 0']
    talks = pd.read_csv(filename).drop(columns=columns_to_drop)

    # for each column, list the indices of the rows that hold a 1.0 in it.
    # The lists are gathered by hand rather than with DataFrame.apply: apply turns
    # equal-length list results (e.g. every list empty) into a DataFrame instead of
    # a Series of lists, which would break the conversion to a one-column frame.
    talk_lists = [talks.index[talks[column] == 1.0].tolist() for column in talks.columns]

    # dtype=object keeps each list as a single cell value
    talks_per_topic = pd.Series(
        talk_lists, index=talks.columns, dtype=object, name='List of Talks'
    ).to_frame()

    # get rid of the rows that are not topics
    return talks_per_topic.drop(index=['Year', 'Speaker', 'Title', 'File', 'topic_lists'])

In [78]:
def get_talks_from_indices(data, talk_indices_per_topic):
    """Build a {topic: DataFrame} lookup of the talks carrying each topic tag.

    Parameters
    ----------
    data : pandas.DataFrame
        Table of all talks; rows are selected from it by position.
    talk_indices_per_topic : pandas.DataFrame
        Indexed by topic; each row holds the list of row positions of that
        topic's talks.

    Returns
    -------
    dict
        Keys are the topics, values are DataFrames of the talks for that topic.
        Of the columns in ``data`` only Year, Speaker, Title, File, Month and
        topic_lists are kept.
    """
    info_columns = ['Year', 'Speaker', 'Title', 'File', 'Month', 'topic_lists']
    # everything that is not talk information is removed from the per-topic frames
    columns_to_remove = [name for name in data.columns if name not in info_columns]

    topic_dfs = {}
    for topic in talk_indices_per_topic.index:
        # iterating over the topic's row yields its cell value(s): the list of talk positions
        for positions in talk_indices_per_topic.loc[topic]:
            topic_dfs[topic] = data.iloc[positions].drop(columns=columns_to_remove)
    return topic_dfs
        
    

In [91]:
# Build the topic -> list-of-talk-indices table from the same CSV that `data` is read from.
talk_indices_per_topic = get_talk_indices("../merged_summary_topics.csv")
# Optional export of the table to JSON (left disabled).
#talk_indices_per_topic.to_json('../talk_indices_per_topic.json')

In [97]:
# Map each topic to a DataFrame of its talks, using the full table and the per-topic index lists.
topic_dictionary = get_talks_from_indices(data, talk_indices_per_topic)

def create_topic_data(topic_dictionary, output_dir="../Topic_Data"):
    """Write one CSV file per topic containing that topic's DataFrame of talks.

    Each file is named after its topic with spaces replaced by underscores
    (``topic_name.csv``) and is saved in ``output_dir``.

    Parameters
    ----------
    topic_dictionary : dict
        Keys are topic names (strings), values are the DataFrames to save.
    output_dir : str or path-like, optional
        Directory the files are written to; created if it does not exist.
        Defaults to the Topic_Data directory ("../Topic_Data").

    Returns
    -------
    None
    """
    # to_csv does not create missing directories, so make sure the target exists
    os.makedirs(output_dir, exist_ok=True)
    for topic, talks in topic_dictionary.items():
        file_name = topic.replace(' ', '_')
        talks.to_csv(os.path.join(output_dir, f"{file_name}.csv"))


In [98]:
# Disabled by default: uncomment to write one CSV per topic into ../Topic_Data.
#create_topic_data(topic_dictionary)