# get_all-features-results.ipynb
This notebook: 
* Calculates the pairwise Pearson correlation (and its p-value) between every pair of numeric features.
* Defines the Bonferroni-corrected alpha threshold, and filters out correlations which do not meet this criterion.
* Filters out correlations with abs(c) < 0.1.
* Writes the top 10 words for each topic into the df, if a topic variable has a significant correlation. This aids with interpretability of the topic (i.e., Topic 42 is interpreted by its top 10 words, not by "Topic 42"). 

This happens on 2 versions of the df: (1) all the episodes for each show (approx. 80k episodes), (2) one episode per show (approx. 15k episodes).

In [1]:
import pandas as pd
import os
import json
from tqdm import tqdm
from scipy import stats
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Lift pandas' display limits so frames are rendered in full:
# no cap on the number of columns, the number of rows, or the column width.
for _display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

def get_results_df(only_one_episode_per_show=False):
    """Correlate every pair of numeric features and save the significant ones.

    Steps:
      1. Read ``./csv/df-all-features.csv`` (first column used as the index).
      2. Optionally keep only the first row for each ``show_uri``.
      3. Run a Pearson correlation test on every unordered pair of numeric
         columns.
      4. Drop pairs with ``abs(correlation) < 0.1`` and round the remaining
         correlations to 2 decimal places.
      5. Drop pairs whose p-value is not below the Bonferroni-corrected alpha
         (``0.05 / number of tests run``).
      6. For attributes whose name contains ``"Topic"``, add the topic number
         and its 10 words from ``./LDA_topics.csv`` in the
         ``attr1_topic_words`` / ``attr2_topic_words`` columns (empty string
         for all other attributes).
      7. Display the resulting frame and write it to CSV.

    Parameters
    ----------
    only_one_episode_per_show : bool, default False
        If True, de-duplicate on ``show_uri`` (keeping the first occurrence)
        before computing correlations, and write to the "one episode per show"
        output file instead of the default one.

    Returns
    -------
    None
        The result is shown with ``display`` and written to disk.

    Notes
    -----
    ``display`` is the IPython/Jupyter built-in; this function is meant to be
    called from a notebook.
    """
    csv_name = "./csv/df-all-features-results.csv"
    fig_name = "./fig-all_episodes.png"  # only referenced by the optional heatmap below

    # read in df
    df = pd.read_csv("./csv/df-all-features.csv", index_col=0)
    # display(df.head())

    if only_one_episode_per_show:
        df = df.drop_duplicates(subset="show_uri", keep="first")
        # NOTE(review): the file name says "one-show-per-episode" although the data is
        # one episode per show; the name is kept as-is because other code may read it.
        csv_name = "./csv/df-all-features-one-show-per-episode-results.csv"
        fig_name = "./fig-one_episode.png"

    print("df length:", len(df))

    # print column names
    numeric_cols = df.select_dtypes(include='number').columns.tolist()
    print("Numeric columns:", numeric_cols)
    print("# of numeric columns:", len(numeric_cols), "\n")

    # calculate correlations and p-values for every unordered pair of numeric columns
    results_lists = []
    for position, attr1 in enumerate(numeric_cols):
        for attr2 in numeric_cols[position + 1:]:
            correlation, pvalue = stats.pearsonr(x=df[attr1], y=df[attr2])
            results_lists.append([attr1, attr2, correlation, pvalue])
    results_df = pd.DataFrame(results_lists, columns=["attr1", "attr2", "correlation", "pvalue"])

    # The Bonferroni correction must use the number of tests actually run,
    # so count them before any filtering happens.
    num_tests_run = len(results_df)
    print("number of tests run:", num_tests_run)

    # # Create a heatmap
    # numeric_df = df.select_dtypes(include=np.number)
    # plt.figure(figsize=(120,120))
    # sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
    # plt.title('Correlation Heatmap of Numeric Columns')
    # plt.savefig(fig_name)  # save before show(): show() may leave an empty figure behind
    # plt.show()

    # filter out correlations with low correlation values
    # (.copy() so the column assignments below act on an independent frame
    # rather than on a slice of the unfiltered one)
    threshold = 0.1
    results_df = results_df[results_df["correlation"].abs() >= threshold].copy()

    # then round the correlation values to 2 decimal places
    # (filtering is done on the unrounded values, so e.g. 0.096 is dropped, not kept as 0.1)
    results_df["correlation"] = results_df["correlation"].round(2)

    # filter out correlations with high p-value
    # max(..., 1) avoids a ZeroDivisionError when there are fewer than two numeric columns
    alpha_adjusted = 0.05 / float(max(num_tests_run, 1))
    print("alpha_adjusted:", alpha_adjusted)
    results_df = results_df[results_df["pvalue"] < alpha_adjusted].copy()

    lda_topics = pd.read_csv("./LDA_topics.csv", header=None)

    def get_top_10_words(topic_num):
        """Return the words in columns 1-10 of the topic's row, comma-separated."""
        # subtract 1 because the row index is 0-indexed, but topics are 1-indexed;
        # column 0 is skipped (presumably a topic label -- confirm against LDA_topics.csv)
        words = lda_topics.iloc[topic_num - 1, 1:11]
        # the trailing space is kept so the written CSV matches earlier outputs
        return ", ".join(words) + " "

    def describe_topic(attr):
        """Return "<n>: <top 10 words>" for a topic attribute, "" for anything else."""
        if "Topic" not in attr:
            return ""
        # attribute names look like "Topic_<n>_Probability"
        topic_num = int(attr.split("_")[1])
        return str(topic_num) + ": " + get_top_10_words(topic_num)

    results_df["attr1_topic_words"] = results_df["attr1"].map(describe_topic)
    results_df["attr2_topic_words"] = results_df["attr2"].map(describe_topic)

    display(results_df)

    # save
    results_df.to_csv(csv_name, header=True)

In [2]:
# Run the analysis on every row of the feature table (no de-duplication by show).
get_results_df(False)

df length: 82601
Numeric columns: ['duration', 'transcript_length', 'parse_INTJ_count', 'parse_EDITED_count', 'parse_PRN_count', 'parse_ADJP_count', 'parse_ADVP_count', 'parse_NP_count', 'parse_PP_count', 'parse_S_count', 'parse_SBAR_count', 'parse_SBARQ_count', 'parse_SINV_count', 'parse_SQ_count', 'parse_VP_count', 'parse_WHADVP_count', 'parse_WHNP_count', 'parse_WHPP_count', 'parse_X_count', 'female', 'male', 'music', 'noEnergy', 'noise', 'Topic_1_Probability', 'Topic_2_Probability', 'Topic_3_Probability', 'Topic_4_Probability', 'Topic_5_Probability', 'Topic_6_Probability', 'Topic_7_Probability', 'Topic_8_Probability', 'Topic_9_Probability', 'Topic_10_Probability', 'Topic_11_Probability', 'Topic_12_Probability', 'Topic_13_Probability', 'Topic_14_Probability', 'Topic_15_Probability', 'Topic_16_Probability', 'Topic_17_Probability', 'Topic_18_Probability', 'Topic_19_Probability', 'Topic_20_Probability', 'Topic_21_Probability', 'Topic_22_Probability', 'Topic_23_Probability', 'Topic_24_P

Unnamed: 0,attr1,attr2,correlation,pvalue,attr1_topic_words,attr2_topic_words
0,duration,transcript_length,0.14,0.0,,
1,duration,parse_INTJ_count,0.32,0.0,,
2,duration,parse_EDITED_count,0.27,0.0,,
3,duration,parse_PRN_count,0.16,0.0,,
5,duration,parse_ADVP_count,0.16,0.0,,
6,duration,parse_NP_count,0.15,0.0,,
8,duration,parse_S_count,0.1,1.0026380000000001e-191,,
10,duration,parse_SBARQ_count,0.12,1.561062e-250,,
12,duration,parse_SQ_count,0.17,0.0,,
31,duration,Topic_9_Probability,-0.2,0.0,,"9: also, one, things, way, people, important, different, need, time, health"


In [3]:
# Run the analysis again, keeping only the first row for each show_uri.
get_results_df(True)

df length: 15117
Numeric columns: ['duration', 'transcript_length', 'parse_INTJ_count', 'parse_EDITED_count', 'parse_PRN_count', 'parse_ADJP_count', 'parse_ADVP_count', 'parse_NP_count', 'parse_PP_count', 'parse_S_count', 'parse_SBAR_count', 'parse_SBARQ_count', 'parse_SINV_count', 'parse_SQ_count', 'parse_VP_count', 'parse_WHADVP_count', 'parse_WHNP_count', 'parse_WHPP_count', 'parse_X_count', 'female', 'male', 'music', 'noEnergy', 'noise', 'Topic_1_Probability', 'Topic_2_Probability', 'Topic_3_Probability', 'Topic_4_Probability', 'Topic_5_Probability', 'Topic_6_Probability', 'Topic_7_Probability', 'Topic_8_Probability', 'Topic_9_Probability', 'Topic_10_Probability', 'Topic_11_Probability', 'Topic_12_Probability', 'Topic_13_Probability', 'Topic_14_Probability', 'Topic_15_Probability', 'Topic_16_Probability', 'Topic_17_Probability', 'Topic_18_Probability', 'Topic_19_Probability', 'Topic_20_Probability', 'Topic_21_Probability', 'Topic_22_Probability', 'Topic_23_Probability', 'Topic_24_P

Unnamed: 0,attr1,attr2,correlation,pvalue,attr1_topic_words,attr2_topic_words
0,duration,transcript_length,0.16,2.294008e-83,,
1,duration,parse_INTJ_count,0.3,8.736999e-311,,
2,duration,parse_EDITED_count,0.26,3.137606e-234,,
3,duration,parse_PRN_count,0.14,1.320171e-67,,
5,duration,parse_ADVP_count,0.15,1.377742e-79,,
6,duration,parse_NP_count,0.16,1.6136529999999999e-87,,
8,duration,parse_S_count,0.11,2.645515e-45,,
10,duration,parse_SBARQ_count,0.1,3.366375e-35,,
12,duration,parse_SQ_count,0.16,8.495998e-83,,
18,duration,female,-0.17,1.19567e-94,,
