# get_all-features-df.ipynb
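This notebook builds a single large DataFrame that combines all of the features produced by the various models and packages (the english-fisher-annotations parse/disfluency features, the inaSpeechSegmenter audio-segmentation features, and the LDA topic probabilities) and saves it to `./csv/df-all-features.csv`.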

In [1]:
import pandas as pd
import os
import json
from tqdm import tqdm
from scipy import stats

# set pandas display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# english-fisher-annotations dataframe
dtype_dict = {"parse": str, "orig_dys": str, "dys": str}
parse_df = pd.read_csv("./csv/df-parse.csv", dtype=dtype_dict)

# inaSpeechSegmenter dataframe
df1 = pd.read_csv("./csv/df-inaSpeechSegmenter-4-0.csv")
df2 = pd.read_csv("./csv/df-inaSpeechSegmenter-4-1.csv")
df3 = pd.read_csv("./csv/df-inaSpeechSegmenter-4-2.csv")
df4 = pd.read_csv("./csv/df-inaSpeechSegmenter-4-3.csv")
inaSpeechSegmenter_df = pd.concat([df1, df2, df3, df4])

# topics dataframe
topics_df = pd.read_csv("./csv/df-LDA_topics.csv")

# merge all the dataframes together
common_columns = ["show_uri", "show_name", "show_description", "publisher", "rss_link", "episode_uri", "episode_name", "episode_description", "duration", "show_filename_prefix", "episode_filename_prefix", "language", "transcript", "transcript_length"]
df = pd.merge(parse_df, inaSpeechSegmenter_df, on=common_columns, how="outer")
df = pd.merge(df, topics_df, on=common_columns, how="outer")

# drop columns containing 'Unnamed' in their name because these are old indexes
columns_to_drop = [col for col in df.columns if "Unnamed" in col]
df = df.drop(columns=columns_to_drop)

# if no words were transcribed, replace nan's with empty string
df["transcript"] = df["transcript"].fillna("")

# save
df.to_csv("./csv/df-all-features.csv", header=True)

display(df.head())

Unnamed: 0,show_uri,show_name,show_description,publisher,rss_link,episode_uri,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix,language,transcript,transcript_length,parse,orig_dys,dys,parse_INTJ_count,parse_EDITED_count,parse_PRN_count,parse_ADJP_count,parse_ADVP_count,parse_NP_count,parse_PP_count,parse_S_count,parse_SBAR_count,parse_SBARQ_count,parse_SINV_count,parse_SQ_count,parse_VP_count,parse_WHADVP_count,parse_WHNP_count,parse_WHPP_count,parse_X_count,segmentation,female,male,music,noEnergy,noise,Topic_1_Probability,Topic_2_Probability,Topic_3_Probability,Topic_4_Probability,Topic_5_Probability,Topic_6_Probability,Topic_7_Probability,Topic_8_Probability,Topic_9_Probability,Topic_10_Probability,Topic_11_Probability,Topic_12_Probability,Topic_13_Probability,Topic_14_Probability,Topic_15_Probability,Topic_16_Probability,Topic_17_Probability,Topic_18_Probability,Topic_19_Probability,Topic_20_Probability,Topic_21_Probability,Topic_22_Probability,Topic_23_Probability,Topic_24_Probability,Topic_25_Probability,Topic_26_Probability,Topic_27_Probability,Topic_28_Probability,Topic_29_Probability,Topic_30_Probability,Topic_31_Probability,Topic_32_Probability,Topic_33_Probability,Topic_34_Probability,Topic_35_Probability,Topic_36_Probability,Topic_37_Probability,Topic_38_Probability,Topic_39_Probability,Topic_40_Probability,Topic_41_Probability,Topic_42_Probability,Topic_43_Probability,Topic_44_Probability,Topic_45_Probability,Topic_46_Probability,Topic_47_Probability,Topic_48_Probability,Topic_49_Probability,Topic_50_Probability,Topic_51_Probability,Topic_52_Probability,Topic_53_Probability,Topic_54_Probability,Topic_55_Probability,Topic_56_Probability,Topic_57_Probability,Topic_58_Probability,Topic_59_Probability,Topic_60_Probability,Topic_61_Probability,Topic_62_Probability,Topic_63_Probability,Topic_64_Probability,Topic_65_Probability,Topic_66_Probability,Topic_67_Probability,Topic_68_Probability,Topic_69_Probability,Topic_70_Prob
ability,Topic_71_Probability,Topic_72_Probability,Topic_73_Probability,Topic_74_Probability,Topic_75_Probability,Topic_76_Probability,Topic_77_Probability,Topic_78_Probability,Topic_79_Probability,Topic_80_Probability,Topic_81_Probability,Topic_82_Probability,Topic_83_Probability,Topic_84_Probability,Topic_85_Probability,Topic_86_Probability,Topic_87_Probability,Topic_88_Probability,Topic_89_Probability,Topic_90_Probability,Topic_91_Probability,Topic_92_Probability,Topic_93_Probability,Topic_94_Probability,Topic_95_Probability,Topic_96_Probability,Topic_97_Probability,Topic_98_Probability,Topic_99_Probability,Topic_100_Probability
0,spotify:show:2NYtxEZyYelR6RMKmjfPLB,Kream in your Koffee,A 20-something blunt female takes on the world...,Katie Houle,https://anchor.fm/s/11b84b68/podcast/rss,spotify:episode:000A9sRBYdVh66csG2qEdj,1: It’s Christmas Time!,On the first ever episode of Kream in your Kof...,12.700133,show_2NYtxEZyYelR6RMKmjfPLB,000A9sRBYdVh66csG2qEdj,en,"Hello, hello, hello everyone. This is Katie an...",1716,(FRAG (INTJ (UNK hello)) (INTJ (UNK hello)) (I...,hello _ hello _ hello _ everyone _\nthis _ is ...,hello E hello E hello E everyone _ this _ is _...,19,12,8,29,104,609,136,405,109,5,1,4,435,15,37,0,0,"[('music', 0.0, 2.44), ('female', 2.44, 26.740...",26.84,0.0,2.44,0.7,0.0,1.2e-05,1.2e-05,0.002161,0.002365,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,0.034438,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,0.001214,1.2e-05,1.2e-05,0.003613,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,0.001192,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,0.030293,1.2e-05,1.2e-05,1.2e-05,1.2e-05,0.008942,0.263176,1.2e-05,0.360638,1.2e-05,1.2e-05,0.004425,1.2e-05,1.2e-05,0.007946,1.2e-05,1.2e-05,0.225073,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,0.001251,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,0.052268
1,spotify:show:6vZRgUFTYwbAA79UNCADr4,Inside The 18 : A Podcast for Goalkeepers by G...,Inside the 18 is your source for all things Go...,Inside the 18 GK Media,https://anchor.fm/s/81a072c/podcast/rss,spotify:episode:001UfOruzkA3Bn1SPjcdfa,Ep.36 - Incorporating a Singular Goalkeeping C...,Today’s episode is a sit down Michael and Omar...,43.616333,show_6vZRgUFTYwbAA79UNCADr4,001UfOruzkA3Bn1SPjcdfa,en,Welcome to Inside the 18. Today's episode is a...,2017,(ADJP (UNK welcome) (PP (UNK to) (PP (UNK insi...,welcome _ to _ inside _ the _\ntoday _ 's _ ep...,welcome _ to _ inside _ the _ today _ 's _ epi...,34,16,18,48,106,734,206,467,115,11,2,12,483,15,32,0,8,"[('noEnergy', 0.0, 0.2), ('male', 0.2, 29.98)]",0.0,29.78,0.0,0.2,0.0,1.1e-05,1.1e-05,0.005914,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,0.017988,1.1e-05,1.1e-05,1.1e-05,0.125472,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,0.001155,0.003559,1.1e-05,1.1e-05,0.015363,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,0.001143,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,0.103951,1.1e-05,1.1e-05,1.1e-05,1.1e-05,0.046425,1.1e-05,1.1e-05,1.1e-05,0.005094,1.1e-05,0.060959,1.1e-05,0.22217,0.007833,0.001051,1.1e-05,1.1e-05,0.309811,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,0.005237,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,0.00586,1.1e-05,1.1e-05,1.1e-05,1.1e-05,0.060087
2,spotify:show:5BvKEjaMSuvUsGROGi2S7s,Arrowhead Live!,Your favorite podcast for everything @Chiefs! ...,Arrowhead Live!,https://anchor.fm/s/917dba4/podcast/rss,spotify:episode:001i89SvIQgDuuyC53hfBm,Episode 1: Arrowhead Live! Debut,Join us as we take a look at all current Chief...,58.1892,show_5BvKEjaMSuvUsGROGi2S7s,001i89SvIQgDuuyC53hfBm,en,"Hey Cheese fans! Before we get started, I want...",1518,(FRAG (INTJ (UNK hey)) (NP (UNK cheese) (UNK f...,hey _ cheese _ fans _\nbefore _ we _ get _ sta...,hey E cheese _ fans _ before _ we _ get _ star...,33,23,16,31,90,532,127,365,82,7,0,7,389,15,17,0,1,"[('noEnergy', 0.0, 0.84), ('male', 0.84, 11.6)...",0.0,28.68,0.0,1.3,0.0,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.017489,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.016417,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.375438,1.5e-05,1.5e-05,1.5e-05,0.015987,0.176725,1.5e-05,1.5e-05,1.5e-05,0.010951,1.5e-05,0.214406,1.5e-05,0.028262,1.5e-05,0.007246,1.5e-05,1.5e-05,0.039921,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.004716,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.013284,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.077872
3,spotify:show:7w3h3umpH74veEJcbE6xf4,FBoL,"The comedy podcast about toxic characters, wri...",Emily Edwards,https://www.fuckboisoflit.com/episodes?format=rss,spotify:episode:0025RWNwe2lnp6HcnfzwzG,"The Lion, The Witch, And The Wardrobe - Ashley...",The modern morality tail of how to stay good f...,51.78205,show_7w3h3umpH74veEJcbE6xf4,0025RWNwe2lnp6HcnfzwzG,en,"Sorry to interrupt the show, but I do have to ...",1707,(S (ADJP (UNK sorry) (S (VP (UNK to) (VP (UNK ...,sorry _ to _ interrupt _ the _ show _ but _ i ...,sorry _ to _ interrupt _ the _ show _ but _ i ...,66,20,7,53,105,626,145,383,86,5,0,6,426,13,19,0,3,"[('noEnergy', 0.0, 2.06), ('female', 2.06, 6.4...",25.86,0.0,0.0,4.14,0.0,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,0.010266,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,0.024359,1.3e-05,1.3e-05,1.3e-05,0.007594,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,0.127155,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,0.018362,1.3e-05,1.3e-05,1.3e-05,0.010327,0.017344,0.004134,1.3e-05,1.3e-05,0.264712,0.017538,1.3e-05,1.3e-05,0.002205,1.3e-05,1.3e-05,1.3e-05,1.3e-05,0.105184,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,0.113735,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,0.020751,1.3e-05,1.3e-05,1.3e-05,1.3e-05,0.255202
4,spotify:show:5ljREb8VLogQLT7AKGwav1,UPSC Podcasts,Podcasts useful for UPSC aspirants! Mainly dis...,UPSC Podcast,https://anchor.fm/s/8afceec/podcast/rss,spotify:episode:0025w0gdgkl11Nzkmg1wnm,Tourism in India : Opportunities and Challenges,.,13.788,show_5ljREb8VLogQLT7AKGwav1,0025w0gdgkl11Nzkmg1wnm,en,This is All India Radio. In the program Spotli...,1755,(S (NP (UNK this)) (VP (UNK is) (NP (UNK all) ...,this _ is _ all _ india _ radio _\nin _ the _ ...,this _ is _ all _ india _ radio _ in _ the _ p...,9,10,1,29,56,647,189,289,74,3,0,3,325,10,24,0,0,"[('female', 0.0, 5.62), ('noEnergy', 5.62, 6.0...",18.2,10.62,0.0,1.18,0.0,1.2e-05,1.2e-05,0.001215,0.015956,0.001344,0.001357,0.006328,0.002416,0.060408,0.008331,0.020441,0.076749,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,0.006456,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,0.014816,0.035791,1.2e-05,1.2e-05,0.00424,1.2e-05,0.005102,0.105217,1.2e-05,1.2e-05,0.52437,1.2e-05,1.2e-05,1.2e-05,0.081295,1.2e-05,1.2e-05,1.2e-05,0.006944,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,0.001214,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,0.001214,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05,0.017861,1.2e-05,1.2e-05,1.2e-05,1.2e-05,1.2e-05
