In [None]:
import pandas as pd

# Load the raw export, then derive the analysis columns in a single chain:
#   * parse both timestamp strings and compute the delay between them,
#   * drop the raw timestamp text once it has been parsed,
#   * keep only the part of Age before the first run of dots/whitespace,
#   * shorten the education column name.
df = pd.read_csv('../data/demo_data.csv')
df = (
    df.assign(
        Announced=pd.to_datetime(df['Announced at'], format='%m/%d/%Y %H:%M'),
        Applied=pd.to_datetime(df['Applied at'], format='%m/%d/%Y %H:%M'),
    )
    .assign(Time_Elapsed=lambda frame: frame['Applied'] - frame['Announced'])
    .drop(columns=["Announced at", "Applied at"])
    .assign(Age=lambda frame: frame['Age'].str.split(r'[.\s]+', expand=True)[0])
    .rename(columns={'Highest Education': 'Education'})
)

In [None]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
# Shades of blue passed to plotly as the discrete colour sequence of the pie charts.
blue_variance = [
    "#3333FF",
    "#6666FF",
    "#9999FF",
    "#CCCCFF",
    "#E6E6FF",
    "#0000FF",
    "#0000CC",
    "#000099",
    "#000066",
    "#000033",
]


def draw_one_feature(df,feature):
    """Render, save and show a donut chart of the value shares of one column.

    The relative frequency of every distinct value in ``df[feature]`` is
    plotted as a pie chart with a hole; the text ``'<feature> Distribution'``
    is placed in the centre. The figure is written to
    ``../results/<feature>_ind.png`` and then displayed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column ``feature``.
    feature : str
        Name of the column to summarise; also used in the centre label and
        in the output file name.
    """
    # value_counts(normalize=True) may label its result 'proportion';
    # make sure the share column carries the feature name either way.
    shares = (
        df[feature]
        .value_counts(normalize=True)
        .to_frame()
        .rename(columns={'proportion': feature})
    )

    # Donut chart: one slice per distinct value, labelled with name and percent.
    fig = px.pie(
        shares,
        values=feature,
        names=shares.index,
        hole=0.4,
        color_discrete_sequence=blue_variance,
        width=800,
        height=600,
    )
    fig.update_traces(textinfo='percent+label')

    # Caption placed in the middle of the hole.
    center_caption = dict(
        text=feature + ' Distribution',
        x=0.5,
        y=0.5,
        font_size=18,
        showarrow=False,
    )
    fig.update_layout(
        annotations=[center_caption],
        showlegend=False,
        font_size=16,
        margin=dict(l=0, r=0, b=2, t=0),
    )

    fig.write_image('../results/' + feature + '_ind.png', width=800, height=600, scale=2)
    fig.show()

In [None]:
#%pip install -U kaleido
# NOTE(review): kaleido is presumably what fig.write_image needs for PNG export - confirm.
# Chart the columns at positions 1, 2 and 3 of the cleaned frame.
features = df.columns[1:4].tolist()
for feature in features:
    draw_one_feature(df, feature)

In [None]:

import seaborn as sns
def result_graph(df,feature, sort_by_feature):
    """Plot, save and show applicants vs. 'Yes' rows for each value of a column.

    For every distinct value of ``feature`` two horizontal bars are drawn on
    top of each other: the count of non-null ``Entranced`` entries
    (``Total``) and, in green, the number of those equal to ``'Yes'``. Every
    green bar is annotated with the 'Yes' share in percent, rounded to one
    decimal. The figure is saved as ``../results/<feature>.png`` and shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``feature`` and an ``Entranced`` column.
    feature : str
        Column to group by; also used in the title and the file name.
    sort_by_feature : str
        Label the aggregated table is sorted by (ascending) before plotting.
    """
    summary = df.groupby(feature)['Entranced'].agg(
        Total='count',
        Yes=lambda answers: (answers == 'Yes').sum(),
    )
    summary['Percentage'] = (100*summary['Yes']/summary['Total']).round(1)
    # Sort first, then reset so the row labels run 0..n-1 in plotting order.
    summary = summary.sort_values(by=sort_by_feature).reset_index()

    sns.set(rc={'figure.figsize':(16,10)})
    ax = sns.barplot(x='Total', y=feature, data=summary, color="#CCCCFF", label='Total')
    ax = sns.barplot(x='Yes', y=feature, data=summary, color='green', label='Take Exam')

    # Both bar sets live on the same axes: the first len(summary) patches are
    # skipped, the remaining ones receive the percentage labels.
    percentages = summary['Percentage']
    labelled_bars = ax.patches[summary.shape[0]:]
    for idx, bar in enumerate(labelled_bars):
        bar_end = bar.get_width()
        bar_middle = bar.get_y() + bar.get_height() / 2
        ax.annotate(
            f'{percentages[idx]}%',
            xy=(bar_end, bar_middle),
            xytext=(15, 0),
            fontsize=14,
            textcoords='offset points',
            ha='left',
            va='center',
            color='black',
        )

    plt.title('Comparison of Students: Entrance Exam Participants vs. Applicants by ' + feature, fontsize=18)
    plt.legend(fontsize=18)
    plt.xlabel('')
    plt.ylabel('')
    plt.yticks(rotation=0, fontsize=16)
    plt.xticks(rotation=0, fontsize=16)
    plt.savefig('../results/' + feature + '.png', bbox_inches='tight', dpi=300)
    plt.show()

In [None]:
# One chart per grouping column: Age is ordered by its own values,
# Location and Education by the number of applicants.
result_graph(df, feature='Age', sort_by_feature='Age')
result_graph(df, feature='Location', sort_by_feature='Total')
result_graph(df, feature='Education', sort_by_feature='Total')


In [None]:
def time_foramt_to_total_mins(formatted_time):
    """Return the 1-based day number in which an elapsed time falls.

    Despite its name, this function does not return minutes: the result is
    the whole-day component of the elapsed time plus one, so anything shorter
    than 24 hours belongs to day 1, anything from 24 up to 48 hours to day 2,
    and so on.

    The value is read from the ``days`` attribute of a ``pandas.Timedelta``
    rather than by splitting its string form, so values with sub-minute or
    fractional-second precision are handled as well.

    Parameters
    ----------
    formatted_time : pandas.Timedelta or str
        The elapsed time, either as a timedelta or as text that
        ``pandas.Timedelta`` can parse (e.g. ``'3 days 04:05:00'``).

    Returns
    -------
    int
        Whole days of ``formatted_time`` plus one.

    Raises
    ------
    ValueError
        If the value is missing (``NaT``) or cannot be parsed as a timedelta.
    """
    elapsed = pd.Timedelta(formatted_time)
    # A missing duration has no day component; fail with a clear message.
    if pd.isna(elapsed):
        raise ValueError(f"cannot derive a day number from missing elapsed time: {formatted_time!r}")
    return int(elapsed.days) + 1
# Day number of every application, derived from the announcement-to-application delay.
df['Time'] = df['Time_Elapsed'].apply(time_foramt_to_total_mins)

# Per-day summary: non-null Entranced count, 'Yes' count and 'Yes' share in percent.
grouped_data = (
    df.groupby('Time')['Entranced']
    .agg(Total='count', Yes=lambda answers: (answers == 'Yes').sum())
    .assign(Percentage=lambda daily: (100*daily['Yes']/daily['Total']).round(1))
    .reset_index()
)
grouped_data.to_csv('../results/time_data.csv')

# Scatter of day number against row index, coloured by the Entranced answer.
colors = {'Yes': 'green', 'No': 'red'}
plt.figure(figsize=(18, 6))
plt.scatter(df.index, df['Time'], c=df['Entranced'].map(colors), marker='*')
plt.title('Highest Number of Applicants are observed in Day 1', fontsize=16)
plt.xlabel('Number of Applicants', fontsize=20)
plt.ylabel('Elapsed Time (days)', fontsize=16)
plt.yticks(fontsize=16)
plt.xticks(fontsize=16)
# The scatter call creates no labelled artists, so build one proxy handle per answer.
plt.legend(
    handles=[
        plt.Line2D([0], [0], marker='*', color='w', markerfacecolor=shade, markersize=15, label=answer)
        for answer, shade in colors.items()
    ],
    fontsize=18,
)
plt.grid(True)
plt.savefig('../results/time_elapsed.png', bbox_inches='tight', dpi=300)
plt.show()