In [1]:
# General
import pandas as pd
import numpy as np
import os
import glob
import re

# Plotting
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt

In [3]:
# Read in data. The rest of this cell relies on the `factor` and
# `factor_value` columns being present.
df = pd.read_csv("data/predicted_diffs.csv")

# Relabel IDACI quantiles to make plotting easier: the pattern locates the
# low/medium/high level and the substitution inserts " IDACI" right after it
# (e.g. "low" -> "low IDACI"), keeping any text before or after the level.
pat = r"(\w*\S*X*\S*)(low|medium|high)(.*)"
# Build the row mask once. na=False treats rows whose `factor` is missing as
# non-matching, so the mask is strictly boolean and safe to use with .loc.
is_idaci = df["factor"].str.contains("idaci_quantile", regex=False, na=False)
# Vectorised substitution. Passing the compiled pattern keeps Python `re`
# semantics, and missing values are passed through instead of raising.
df.loc[is_idaci, "factor_value"] = df.loc[is_idaci, "factor_value"].str.replace(
    re.compile(pat), r"\1\2 IDACI\3", regex=True
)

# Save to csv, needed in this format for R plotting
df.to_csv("data/predicted_diffs-processed.csv", index = False)

# Prep for BigQuery
# Variable names: turn "_" and "X" separators into spaces, then title-case
# every word. Category values are title-cased as well.
df["factor"] = df["factor"].str.replace("_|X", " ", regex = True).str.title()
df["factor_value"] = df["factor_value"].str.title()

# Title-casing lower-cases acronyms, so restore them ...
var_mapping = {"Fsm": "FSM", "Sen": "SEN", "Eal": "EAL", "Idaci": "IDACI"}
# ... and map the four-letter category codes to their display labels.
cat_mapping = {"Whit": "White", "Blac": "Black", "Asia": "Asian",
               "Chin": "Chinese", "Mixd": "Mixed", "Aoeg": "AOEG"}
# Each mapping is applied as regex substitutions over the whole frame,
# acronyms first and category codes second.
for mapping in (var_mapping, cat_mapping):
    df = df.replace(mapping, regex = True)

# Neater column names, assigned by position: the list must line up with the
# column order of the frame.
bigquery_columns = [
    "Variable",
    "Category",
    "CATE",
    "Modelled GCSE Points",
    "Modelled GCSE Points (std.)",
    "CAG Points",
    "CAG Points (std.)",
    "Welch's p-value",
    "Num. of Obs.",
]
df.columns = bigquery_columns

# Save to csv
df.to_csv("data/bigquery_data.csv", index = False)