In [1]:
import pandas as pd
import numpy as np
import sys
%matplotlib inline

In [2]:
from describe import describe, mycount, mymean, mystd, mysum

In [3]:
def observed_table(df, fields, houses, type_calc):
    observed = {}
    row = {}
    row['Total Courses'] = [0] * (len(houses)+1)
    for field in fields[2:]:
        sum = 0
        observed[field] = []
        for house in houses:
            sum += df[house][field][type_calc]
            observed[field].append(df[house][field][type_calc])    
        observed[field].append(sum)
        row['Total Courses'] = [row['Total Courses'][i] + observed[field][i] for i in range(len(observed[field]))]
    
    observed['Total Courses'] = row['Total Courses']
    observed_df = pd.DataFrame(observed)
    observed_df.index = houses + ['Total Students']
    return (observed_df)

In [4]:
if __name__ == "__main__":
    sys.argv[1] = "datasets/dataset_train.csv"
    if len(sys.argv) < 2:
        print("USAGE: histogram.py <csv files>")
    else:
        df = pd.read_csv(sys.argv[1])
        houses = df['Hogwarts House'].unique().tolist()
        houses.sort()        
        if (houses != ['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Slytherin']):
            print("data is corrupted")
            
        house_data = {}
        homogeneity_data = {}
        for house in houses:
            homogeneity_data[house] = df.loc[df['Hogwarts House'] == house]
            numeric_features = []
            for c in df.columns:
                col_type = df[c].dtype
                if col_type == float or col_type == int:
                    numeric_features.append(c)
            describe_df = describe(homogeneity_data[house], numeric_features)
            house_data[house] = describe_df
        
        #Total all house:
        numeric_features = []
        for c in df.columns:
            col_type = df[c].dtype
            if col_type == float or col_type == int:
                numeric_features.append(c)
        describe_df = describe(df, numeric_features)
        house_data['Total Student'] = describe_df
        print(house_data)
        observed_df = observed_table(house_data, numeric_features, houses, 'sum')


{'Gryffindor':                Index    Arithmancy      Astronomy    Herbology  \
count     327.000000  3.180000e+02     323.000000   320.000000   
mean      773.773700  4.912200e+04     493.336646    -4.783906   
std       460.012057  1.516860e+04     195.310218     2.249191   
min         3.000000 -4.491000e+03    -622.924308    -9.142445   
25%       382.500000  3.805075e+04     407.222386    -6.142379   
50%       774.000000  4.843400e+04     504.370715    -5.021176   
75%      1153.500000  6.012000e+04     624.239863    -3.742059   
max      1597.000000  9.161400e+04    1016.211940     7.641342   
sum    253024.000000  1.562080e+07  159347.736787 -1530.849874   

       Defense Against the Dark Arts   Divination  Muggle Studies  \
count                     321.000000   321.000000      317.000000   
mean                       -4.948000     4.901349     -501.483149   
std                         1.948529     1.452449      231.878934   
min                       -10.162119     0.05200

In [5]:
subjects = ['Arithmancy', 'Astronomy', 'Herbology',
           'Defense Against the Dark Arts', 'Divination', 'Muggle Studies',
       'Ancient Runes', 'History of Magic', 'Transfiguration', 'Potions',
       'Care of Magical Creatures', 'Charms', 'Flying']
Notes2 = df.melt(id_vars = ['Index','Hogwarts House'], value_vars = subjects, var_name = 'Subject', value_name='Grades')

In [6]:
len(houses)

4

In [7]:
house_data

{'Gryffindor':                Index    Arithmancy      Astronomy    Herbology  \
 count     327.000000  3.180000e+02     323.000000   320.000000   
 mean      773.773700  4.912200e+04     493.336646    -4.783906   
 std       460.012057  1.516860e+04     195.310218     2.249191   
 min         3.000000 -4.491000e+03    -622.924308    -9.142445   
 25%       382.500000  3.805075e+04     407.222386    -6.142379   
 50%       774.000000  4.843400e+04     504.370715    -5.021176   
 75%      1153.500000  6.012000e+04     624.239863    -3.742059   
 max      1597.000000  9.161400e+04    1016.211940     7.641342   
 sum    253024.000000  1.562080e+07  159347.736787 -1530.849874   
 
        Defense Against the Dark Arts   Divination  Muggle Studies  \
 count                     321.000000   321.000000      317.000000   
 mean                       -4.948000     4.901349     -501.483149   
 std                         1.948529     1.452449      231.878934   
 min                       -10.162

In [8]:
observed_df

Unnamed: 0,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying,Total Courses
Gryffindor,159347.736787,-1530.849874,-1588.308109,1573.333,-158970.158371,191019.035902,-1535.968235,304363.1,949.01678,-44.957018,-82643.0519,61811.02,472750.0
Hufflepuff,257315.037207,2553.172575,-2580.502982,2591.179,-260637.454679,206476.086096,2570.281236,547871.4,2554.753326,-14.651436,-129259.75582,-3748.33,625691.3
Ravenclaw,-208338.698204,2174.464091,2095.868413,2159.758,209829.033749,260618.98605,2123.694786,454794.7,3053.916971,1.680531,-102368.6953,-1763.05,624381.7
Slytherin,-145922.174554,-1408.809189,1464.38485,-1401.017,-141704.63746,117731.465095,1455.530528,306102.5,2784.398522,-25.418411,-75127.5514,-21166.82,42781.87
Total Students,62401.901236,1787.977603,-608.557827,4923.253,-351483.216761,775845.573143,4613.538316,1613132.0,9342.085599,-83.346333,-389399.05442,35132.82,1765605.0


In [9]:
expected_df = observed_df
print(expected_df)

                    Astronomy    Herbology  Defense Against the Dark Arts  \
Gryffindor      159347.736787 -1530.849874                   -1588.308109   
Hufflepuff      257315.037207  2553.172575                   -2580.502982   
Ravenclaw      -208338.698204  2174.464091                    2095.868413   
Slytherin      -145922.174554 -1408.809189                    1464.384850   
Total Students   62401.901236  1787.977603                    -608.557827   

                Divination  Muggle Studies  Ancient Runes  History of Magic  \
Gryffindor        1573.333  -158970.158371  191019.035902      -1535.968235   
Hufflepuff        2591.179  -260637.454679  206476.086096       2570.281236   
Ravenclaw         2159.758   209829.033749  260618.986050       2123.694786   
Slytherin        -1401.017  -141704.637460  117731.465095       1455.530528   
Total Students    4923.253  -351483.216761  775845.573143       4613.538316   

                Transfiguration      Potions  Care of Magical 

In [10]:
observed_df

Unnamed: 0,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying,Total Courses
Gryffindor,159347.736787,-1530.849874,-1588.308109,1573.333,-158970.158371,191019.035902,-1535.968235,304363.1,949.01678,-44.957018,-82643.0519,61811.02,472750.0
Hufflepuff,257315.037207,2553.172575,-2580.502982,2591.179,-260637.454679,206476.086096,2570.281236,547871.4,2554.753326,-14.651436,-129259.75582,-3748.33,625691.3
Ravenclaw,-208338.698204,2174.464091,2095.868413,2159.758,209829.033749,260618.98605,2123.694786,454794.7,3053.916971,1.680531,-102368.6953,-1763.05,624381.7
Slytherin,-145922.174554,-1408.809189,1464.38485,-1401.017,-141704.63746,117731.465095,1455.530528,306102.5,2784.398522,-25.418411,-75127.5514,-21166.82,42781.87
Total Students,62401.901236,1787.977603,-608.557827,4923.253,-351483.216761,775845.573143,4613.538316,1613132.0,9342.085599,-83.346333,-389399.05442,35132.82,1765605.0


In [18]:
len(observed_df.iloc[0])

13

In [11]:
max_col = len(observed_df.iloc[0]) - 1
max_row = len(observed_df) - 1
expected = [[observed_df[max_col][j] * observed_df[i][max_row] / observed_df[max_col][max_row] for j in range(max_col)]
            for i in range(max_row)]
print (expected)

KeyError: 0

In [None]:
data2 = {}
#data2['Total Col'] = {}
for col in groupby[0]:
    data2[col] = {}
    for field in numeric_features:
        row = df[field].loc[df[groupby] == col].dropna()
        #for index in ['count', 'mean', 'std']: #put here other necessity
        data2[col][field] = [mycount(row), mymean(row), mystd(row)] 
        #data2['Total Row'][field]['count'] += mycount(col) # this is wrong
print(data2);
df_heterogienity = pd.DataFrame(data2)