In [1]:
import thinkplot
import thinkstats2
import pandas as pd
import numpy as np
import scipy
import seaborn as sns
from matplotlib import pyplot as plt

# Assignment #1 - Basic Data Exploration in Python

## Assignment Overview

In this assignment you'll load some data into a python notebook, and use some basic functions to do some basic analysis. Each section asks you to either calculate some answers or explore some information found in the data. When generating your answers, try to think about a few factors that can make your code better:
<ul>
<li> Present the answers clearly. Use the markdown cells, code comments, and formatting to help make your answers readable. One of the features of notebooks is that they allow us to combine code and commentary, and we need to ensure both are readable. Refer to the guide in the guides folder of the exercises workbook for an explanation and examples of different formatting. 
<li> Make your code clear. It is easy to make sense of small pieces of code, for short periods of time, so if your code makes little sense here, it won't really hurt your ability to find the answers. If you need to come back to it later, or others need to edit it, having code that doesn't make sense is a big issue. Use things like clearly named variables, comments, and spacing to make things readable. Even in this course, if you are looking back to something from 2 months ago for the project, it is far easier to understand code that is cleaned up a little. 
<li> Structure the code well. If there is some kind of repetitive task, it should likely be moved into a function. If there is something that happens several times, it should be in a loop. Having well structured code makes it easy to reuse stuff later, understand how things work, debug errors, and share code with others. This is something to keep in the back of your minds, right now you may not have much experience to lean on to judge how things should be, as you read, adjust, and write code it will become more clear. 
</ul>

## Peer Evaluation

This assignment will have a peer evaluation for the grade - you'll each read some other notebooks and evaluate their answers. The reason for this is not (entirely) my personal laziness; we are hoping to accomplish a couple of things:
<ul>
<li> Quick experience reading notebooks. One of the important skills we need to have is the ability to read examples that we find and adapt them to our scenario. This is practice with data you're now comfortable with.
<li> Critically evaluating what makes sense in a data notebook. You know how you attempted to explain your look at the data, how did other people do it? What was better? What was worse?
<li> Design is subjective. This will give you a set of opinions that are not mine. You will end up with plenty of feedback on what I think; feedback from others early on should hopefully help you make things appealing to all and not tailored to me. 
</ul>
Doing this right up front will hopefully help kickstart some of that experience. 

### Grading

The detailed grading is defined in the marking guide in the Moodle workshop. Note that there's points both for getting a correct answer and for presenting it in a way that makes sense. 

## Structure

Each section contains a set of questions/prompts and a rough framework for your responses. You can change the structure around, just make sure it is readable. What I have there is just a loose guide to help get you started, you're not bound to it. 

## Load Data

Load the labor data into a dataframe. You probably don't want to change this section. 

In [2]:
# Read the labour training evaluation CSV into `df` and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="LabourTrainingEvaluationData.csv")
df.head(n=5)

Unnamed: 0,Age,Eduacation,Race,Hisp,MaritalStatus,Nodeg,Earnings_1974,Earnings_1975,Earnings_1978
0,45,LessThanHighSchool,NotBlack,NotHispanic,Married,1,21516.67,25243.55,25564.67
1,21,Intermediate,NotBlack,NotHispanic,NotMarried,0,3175.971,5852.565,13496.08
2,38,HighSchool,NotBlack,NotHispanic,Married,0,23039.02,25130.76,25564.67
3,48,LessThanHighSchool,NotBlack,NotHispanic,Married,1,24994.37,25243.55,25564.67
4,18,LessThanHighSchool,NotBlack,NotHispanic,Married,1,1669.295,10727.61,9860.869


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of `df`.
df.describe()

Unnamed: 0,Age,Nodeg,Earnings_1974,Earnings_1975,Earnings_1978
count,15992.0,15992.0,15992.0,15992.0,15992.0
mean,33.225238,0.295835,14016.800304,13650.803376,14846.659673
std,11.045216,0.456432,9569.795893,9270.403225,9647.391524
min,16.0,0.0,0.0,0.0,0.0
25%,24.0,0.0,4403.45225,4398.823,5669.298
50%,31.0,0.0,15123.58,14557.11,16421.975
75%,42.0,1.0,23584.18,22923.7375,25564.67
max,55.0,1.0,25862.32,25243.55,25564.67


## Part 1 - Age

<ol>
<li> Make and plot a Hist and Pmf for age.
<li> What fraction of the people in the data are 51? What fraction are older than 51?
<li> What is the median age? 
<li> Does the distribution of the sample data seem to mirror the working age population?
</ol>

#### Make and plot a Hist and Pmf for age.

In [None]:
from ctypes import alignment
import thinkplot
import thinkstats2
import pandas as pd
import numpy as np

# Load the dataset for the age analysis. `BSIP_1` and `Age_data` are reused by later cells.
data_path = "LabourTrainingEvaluationData.csv"
BSIP_1 = pd.read_csv(data_path)
Age_data = BSIP_1["Age"]

# Histogram: number of people at each age.
Age_hist = thinkstats2.Hist(Age_data)
thinkplot.Hist(Age_hist)
# Show() applies the labels, renders the figure and then clears it, so the PMF
# further down gets its own axes instead of being drawn on top of the count bars
# (counts and probabilities differ by orders of magnitude, which hides the PMF).
thinkplot.Show(ylabel="Count", xlabel=" Age")

# Central tendency of the age column, rounded to 2 decimals for display.
Age_data_mean = Age_data.mean()
Age_data_median = Age_data.median()
print("The mean of the Age data is:", round(Age_data_mean, 2))
print("The median of the Age data is:", round(Age_data_median, 2))

# PMF: the same distribution normalised to sum to 1, so the y-axis is a
# probability rather than a count.
Age_pmf = thinkstats2.Pmf(Age_data)
thinkplot.Pmf(Age_pmf)
thinkplot.Config(ylabel="Probability", xlabel=" Age")

#### What fraction of the people in the data are 51? What fraction are older than 51?

In [None]:
# Drop missing ages so they are counted in neither the numerator nor the denominator.
Age_data = Age_data[~np.isnan(Age_data)]
total_people = len(Age_data)

# Head-counts for "exactly 51" and "strictly older than 51".
age_fiftyone = int((Age_data == 51).sum())
age_fiftyoneabove = int((Age_data > 51).sum())

# A fraction is part / whole. Note that `whole // part` would instead give a
# floored "1 in N" ratio, which is the inverse of what the question asks for.
frac_age_fiftyone = age_fiftyone / total_people
frac_age_fiftyoneabove = age_fiftyoneabove / total_people
print ("The fraction of people age 51 is;", round(frac_age_fiftyone, 4))
print ("The fraction of people age above 51 is;", round(frac_age_fiftyoneabove, 4))

##### What is the median age?

In [None]:
# Median (the value asked for) and mean of the age column; both are shown rounded to 2 decimals.
Age_data_median = Age_data.median()
Age_data_mean = Age_data.mean()

mean_for_display = round(Age_data_mean, 2)
median_for_display = round(Age_data_median, 2)
print("The mean of the Age data is:", mean_for_display)
print("The median of the Age data is:", median_for_display)

##### Does the distribution of the sample data seem to mirror the working age population?

### Part 2 - Demographics

<ul>
<li>Consider some of the demographic features: 
    <ul>
    <li>Education
    <li>Race
    <li>Hisp
    <li>MaritalStatus
    <li>Nodeg. 
    </ul>
</ul>
<ol>
<li>This data came from the United States, does it appear to be a representative sample of the US population?
<li>Demonstrate this in some way in your code. 
</ol>

##### Exploring the data...

In [None]:
# Per the US Census Bureau, the US population as at Jan 2022 is 332,403,650.
# The sorted counts for Education show that less than 10% of the current population was analysed,
# hence this is not a true representation of the population.
us_population = 332403650
print(us_population)

# Reload the dataset and print summary statistics for every column, categorical ones included.
data_path = "LabourTrainingEvaluationData.csv"
BSIP_1 = pd.read_csv(data_path)
full_summary = BSIP_1.describe(include="all")
print(full_summary)

# Bar chart of how many rows fall into each education category.
# The column name is spelled "Eduacation" in the CSV header, so that spelling is required here.
education_data = BSIP_1["Eduacation"]
education_hist = thinkstats2.Hist(education_data)
width = 0.4
thinkplot.Hist(education_hist, width=width, align='left')
thinkplot.Config(xlabel=" Category", ylabel="Count")

# Exact row count per category, sorted by category label.
education_data.value_counts().sort_index()

In [None]:
# Per the "Hisp" counts and the US Census Bureau, the US population as at Jan 2022 is 332,403,650.
# The sorted counts for Hisp show that less than 10% of the current population was analysed,
# hence this is not a true representation of the population.

# Bar chart of how many rows fall into each Hisp category.
hisp_data = BSIP_1["Hisp"]
hisp_hist = thinkstats2.Hist(hisp_data)
width = 0.4
thinkplot.Hist(hisp_hist, width=width, align='left')
thinkplot.Config(xlabel=" Category", ylabel="Count")

# Exact row count per category, sorted by category label.
hisp_data.value_counts().sort_index()

##### Demographic Conclusion


In [None]:
# Per the "Marital Status" counts and the US Census Bureau, the US population as at Jan 2022 is 332,403,650.
# The sorted counts for Marital Status show that less than 10% of the current population was analysed,
# hence this is not a true representation of the population.

# Bar chart of how many rows fall into each marital-status category.
ms_data = BSIP_1["MaritalStatus"]
ms_hist = thinkstats2.Hist(ms_data)
width = 0.4
thinkplot.Hist(ms_hist, width=width, align='left')
thinkplot.Config(xlabel=" Category", ylabel="Count")

# Exact row count per category, sorted by category label.
ms_data.value_counts().sort_index()

In [None]:
# Per the "No deg" counts and the US Census Bureau, the US population as at Jan 2022 is 332,403,650.
# The sorted counts for No deg show that less than 10% of the current population was analysed,
# hence this is not a true representation of the population.
width = 0.4
nodeg_data = BSIP_1["Nodeg"]
nodeg_hist = thinkstats2.Hist(nodeg_data)

# Plot the Nodeg histogram built just above. Passing `ms_hist` here would silently
# redraw the marital-status chart under a Nodeg heading.
thinkplot.Hist(nodeg_hist, align='left', width=width)
thinkplot.Config(ylabel="Count", xlabel=" Category")

# Exact row count per flag value (0 / 1), sorted by value.
nodeg_data.value_counts().sort_index()

In [None]:
# Per the "Race" counts and the US Census Bureau, the US population as at Jan 2022 is 332,403,650.
# The sorted counts for Race show that less than 10% of the current population was analysed,
# hence this is not a true representation of the population.

# Bar chart of how many rows fall into each race category.
race_data = BSIP_1["Race"]
race_hist = thinkstats2.Hist(race_data)
width = 0.4
thinkplot.Hist(race_hist, width=width, align='left')
thinkplot.Config(xlabel=" Category", ylabel="Count")

# Exact row count per category, sorted by category label.
race_data.value_counts().sort_index()

### Part 3 - Earnings

<ol>
<li>Make and plot a graph or graphs of your choosing of the 3 earnings values, in order to answer the below question. Identify how the graph gave you your answer.
    <ul>
    <li>What is one conclusion you could draw from visualizing the earnings in the different years? Please express it in plain language/non-statistics terms, as though you were explaining to one of your friends what happened to earnings between 1974 and 1978.
    </ul>
<li>Which has the greatest effect size on 1978 earnings: Race, Hispanic, MaritalStatus, or Nodeg? 
    <ul>
    <li>What could you investigate further in an attempt to explain this?
    </ul>
<li>Plot a histogram and PMF, and compute useful descriptive statistics (think: average...) for the 1978 earnings value. Use the "Cohorts" code from the quiz to break the data into cohorts, plotting each group (either on one chart, or separately, whichever makes the most sense for examining the data - state specifically why you chose 1 vs many charts).
    <ul>
    <li>What is the difference in median income between the groups?
    <li>Theorize a reason for the difference between the groups that could be explained with more data. Briefly describe the data you'd need. This does not need to be something you have data for, or know how to solve right now - just one well founded hypothesis on what might explain the difference.
    </ul>
<li>Are there outliers in the 1978 earnings data? Demonstrate this in some way with your code. 
    <ul>
    <li>What can you infer from the presence of outliers that may impact analysis of this data.
    </ul>
</ol>

##### Plot Earnings Data

In [None]:
import matplotlib.pyplot as plt 

# Reload the dataset; `data` is kept as a second name for the same frame.
data_path = "LabourTrainingEvaluationData.csv"
BSIP_1 = pd.read_csv(data_path)
data = BSIP_1

# Raw value arrays, kept under their original names.
age = data.Age.values
earnings_1974 = data.Earnings_1974.values
earnings_1975 = data.Earnings_1975.values
earnings_1978 = data.Earnings_1978.values

# One histogram per year, drawn by the same code so the three charts are
# directly comparable (same bin count, same labelling scheme).
earnings_by_year = {
    "Earnings_1974": earnings_1974,
    "Earnings_1975": earnings_1975,
    "Earnings_1978": earnings_1978,
}
for column_name, yearly_values in earnings_by_year.items():
    plt.hist(yearly_values, bins=50)
    plt.xlabel(column_name + ' ($)')
    plt.ylabel('Count')
    # Render each figure before starting the next one so none of them overlap.
    plt.show()

# Side-by-side descriptive statistics for the three years, as the cell's output.
# One table is easier to compare than three Series printed on a single line.
BSIP_1[list(earnings_by_year)].describe().round(2)



As a conclusion, there seems to be a large jump in the number of people earning near the top end, around $25k. Otherwise the distributions are pretty similar. 

##### What to Investigate Further


##### Histogram and PMF

In [4]:
#Question
#Create a hist, pmf, and stats for 1978 Earnings
#Note: if you don't round or otherwise create bins, the thinkstats plots for this will be weird
#That's because there are very few values that are the same, e.g. $14762.34, $15672.86, etc... 
#Every cent is a different value. 
#Seaborn plots will probably look better by default. 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import statsmodels.api as sm

# Reload the dataset; `data` is the frame the remainder of this cell works on.
data_path = "LabourTrainingEvaluationData.csv"
BSIP_1 = pd.read_csv(filepath_or_buffer=data_path)
data = pd.DataFrame(BSIP_1)







# Mean, max and min of 1978 earnings for each demographic group, one bar chart per statistic.
#
# Groups come from groupby(), i.e. from the exact category values. A substring test such as
# .str.contains('Married') also matches 'NotMarried' (likewise 'Black'/'NotBlack' and
# 'Hispanic'/'NotHispanic'), which makes the first bar of each pair cover every row.

# Nodeg is a 0/1 flag; in the preview rows LessThanHighSchool has Nodeg == 1, so 1 means "no degree".
NODEG_LABELS = {0: 'Degree', 1: 'No Degree'}
DEMOGRAPHIC_COLUMNS = ['MaritalStatus', 'Race', 'Hisp', 'Nodeg']


def summarise_earnings_1978(frame):
    """Return mean/max/min of Earnings_1978 for every demographic group.

    frame: DataFrame holding the DEMOGRAPHIC_COLUMNS and 'Earnings_1978'

    returns: DataFrame indexed by group label, with columns 'mean', 'max', 'min'
    """
    per_column = []
    for column in DEMOGRAPHIC_COLUMNS:
        stats = frame.groupby(column)['Earnings_1978'].agg(['mean', 'max', 'min'])
        if column == 'Nodeg':
            # Replace the raw 0/1 flag with readable labels for the chart.
            stats = stats.rename(index=NODEG_LABELS)
        per_column.append(stats)
    return pd.concat(per_column)


group_summary = summarise_earnings_1978(data)
bar_positions = np.arange(len(group_summary))

# Same chart three times, differing only in the statistic plotted and its axis label.
for statistic, axis_label in [('mean', 'Mean Earning 1978'),
                              ('max', 'Max Earning 1978'),
                              ('min', 'Min Earning 1978')]:
    plt.figure(figsize=(8, 5))
    plt.bar(bar_positions, group_summary[statistic], align='center', alpha=0.5)
    plt.xticks(bar_positions, group_summary.index)
    plt.ylabel(axis_label)
    plt.show()

# The trend in the mean, max and min earnings in 1978 can be used to estimate the most influential variable.
# Working note: marital status looked like the key driver, as unmarried individuals on average
# earned less than married individuals.
# NOTE(review): that reading was made with overlapping groups; re-check it against the effect sizes.

# 1978 earnings per group, used as the inputs to Cohen's effect size.
# Each pair is split on the exact 'Not...' label and its complement, so the two groups
# never overlap. .str.contains('Married') would also match 'NotMarried' and make the
# first group the whole dataset.
# Assumes each column holds only its two labels -- TODO confirm with value_counts().
is_not_married = data['MaritalStatus'] == 'NotMarried'
Mar_Data = data.loc[~is_not_married, 'Earnings_1978']
Not_Mar_Data = data.loc[is_not_married, 'Earnings_1978']

is_not_black = data['Race'] == 'NotBlack'
Race_data = data.loc[~is_not_black, 'Earnings_1978']
Race_data_2 = data.loc[is_not_black, 'Earnings_1978']

is_not_hispanic = data['Hisp'] == 'NotHispanic'
Hisp_data = data.loc[~is_not_hispanic, 'Earnings_1978']
Hisp_data_2 = data.loc[is_not_hispanic, 'Earnings_1978']

# Nodeg is a numeric flag: > 0 is the first group, == 0 the second.
Nodeg_data = data.loc[data['Nodeg'] > 0, 'Earnings_1978']
Nodeg_data_2 = data.loc[data['Nodeg'] == 0, 'Earnings_1978']

def CohenEffectSize(group1, group2):
    """Cohen's d: the difference in means expressed in pooled standard deviations.

    The pooled variance weights each group's variance by its number of rows.
    Variances come from each object's own .var() method.

    group1: Series or DataFrame
    group2: Series or DataFrame

    returns: float if the arguments are Series;
             Series if the arguments are DataFrames
    """
    size1 = len(group1)
    size2 = len(group2)

    # Size-weighted average of the two variances.
    weighted_variance_sum = size1 * group1.var() + size2 * group2.var()
    pooled_variance = weighted_variance_sum / (size1 + size2)

    mean_gap = group1.mean() - group2.mean()
    return mean_gap / np.sqrt(pooled_variance)
# Cohen's d of 1978 earnings for each demographic split (first group minus second group).
effect_sizes = {
    'Effect Size Marriage Status': CohenEffectSize(Mar_Data, Not_Mar_Data),
    'Effect Size Race': CohenEffectSize(Race_data, Race_data_2),
    'Effect Size Hisp': CohenEffectSize(Hisp_data, Hisp_data_2),
    'Effect size No Deg': CohenEffectSize(Nodeg_data, Nodeg_data_2),
}
for label, effect in effect_sizes.items():
    print(label, effect)

# Derive the conclusion from the numbers instead of hard-coding it in a comment.
# The sign only records which group of a pair earns more, so rank by magnitude.
largest_label = pd.Series(effect_sizes).abs().idxmax()
print('Largest effect size (by magnitude):', largest_label, effect_sizes[largest_label])


##### Differences in Median Income, and Theory on Why


##### Outliers

##### Outlier Conclusions
