In [57]:
#####################################################################################################
# Exploratory Data Analysis 
# Exploratory Data Analysis (EDA) is the practice of examining a data set with summary statistics and visualizations.
# This notebook applies these techniques as a first step, before fitting any Machine Learning models.
#####################################################################################################
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 

from scipy.stats import trim_mean

In [65]:
# Read the data: load the state-level CSV file into a pandas DataFrame.
# NOTE(review): the path is an absolute Windows location, so this cell only runs
# on a machine that has D:\state.csv; consider a configurable, relative data path.
data_set= pd.read_csv("D:\\state.csv")
# Show the loaded frame as the cell output.
data_set

Unnamed: 0,State,Population,Unemployment.Rate,Abbreviation
0,Alabama,4779736,5.7,AL
1,Alaska,710231,5.6,AK
2,Arizona,6392017,4.7,AZ
3,Arkansas,2915918,5.6,AR
4,California,37253956,4.4,CA
5,Colorado,5029196,2.8,CO
6,Connecticut,3574097,2.4,CT
7,Delaware,897934,5.8,DE
8,Florida,18801310,5.8,FL
9,Georgia,9687653,5.7,GA


In [13]:
# Confirm that read_csv produced a DataFrame. "\n\n" appends blank lines after the
# message; the previous "/n/n" used forward slashes and was printed literally.
print("type:", type(data_set), "\n\n")

type: <class 'pandas.core.frame.DataFrame'> /n/n


In [15]:
# Preview the first 10 rows of the frame.
data_set.head(10)

Unnamed: 0,State,Population,Unemployment.Rate,Abbreviation
0,Alabama,4779736,5.7,AL
1,Alaska,710231,5.6,AK
2,Arizona,6392017,4.7,AZ
3,Arkansas,2915918,5.6,AR
4,California,37253956,4.4,CA
5,Colorado,5029196,2.8,CO
6,Connecticut,3574097,2.4,CT
7,Delaware,897934,5.8,DE
8,Florida,18801310,5.8,FL
9,Georgia,9687653,5.7,GA


In [16]:
# Preview the last 10 rows of the frame.
data_set.tail(10)

Unnamed: 0,State,Population,Unemployment.Rate,Abbreviation
40,South Dakota,814180,2.3,SD
41,Tennessee,6346105,5.7,TN
42,Texas,25145561,4.4,TX
43,Utah,2763885,2.3,UT
44,Vermont,625741,1.6,VT
45,Virginia,8001024,4.1,VA
46,Washington,6724540,2.5,WA
47,West Virginia,1852994,4.0,WV
48,Wisconsin,5686986,2.9,WI
49,Wyoming,563626,2.7,WY


In [66]:
##Code #1 : Adding Column to the dataframe
# Adding a new column with derived data: the population expressed in millions
# (the raw Population value divided by 1,000,000).
data_set['Populationsinmillions']=data_set['Population']/1000000
# Show the frame to confirm the new column is present.
data_set

Unnamed: 0,State,Population,Unemployment.Rate,Abbreviation,Populationsinmillions
0,Alabama,4779736,5.7,AL,4.779736
1,Alaska,710231,5.6,AK,0.710231
2,Arizona,6392017,4.7,AZ,6.392017
3,Arkansas,2915918,5.6,AR,2.915918
4,California,37253956,4.4,CA,37.253956
5,Colorado,5029196,2.8,CO,5.029196
6,Connecticut,3574097,2.4,CT,3.574097
7,Delaware,897934,5.8,DE,0.897934
8,Florida,18801310,5.8,FL,18.80131
9,Georgia,9687653,5.7,GA,9.687653


In [25]:
# Preview the first 10 rows again, this time with the derived column included.
data_set.head(10)

Unnamed: 0,State,Population,Unemployment.Rate,Abbreviation,Populationsinmillions
0,Alabama,4779736,5.7,AL,4.779736
1,Alaska,710231,5.6,AK,0.710231
2,Arizona,6392017,4.7,AZ,6.392017
3,Arkansas,2915918,5.6,AR,2.915918
4,California,37253956,4.4,CA,37.253956
5,Colorado,5029196,2.8,CO,5.029196
6,Connecticut,3574097,2.4,CT,3.574097
7,Delaware,897934,5.8,DE,0.897934
8,Florida,18801310,5.8,FL,18.80131
9,Georgia,9687653,5.7,GA,9.687653


In [26]:
# Data description: summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns.
data_set.describe()

Unnamed: 0,Population,Unemployment.Rate,Populationsinmillions
count,50.0,50.0,50.0
mean,6162876.0,4.066,6.162876
std,6848235.0,1.915736,6.848235
min,563626.0,0.9,0.563626
25%,1833004.0,2.425,1.833004
50%,4436370.0,4.0,4.436369
75%,6680312.0,5.55,6.680312
max,37253960.0,10.3,37.253956


In [27]:
# Data information: index range, column names, non-null counts, dtypes and memory usage.

data_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
State                    50 non-null object
Population               50 non-null int64
Unemployment.Rate        50 non-null float64
Abbreviation             50 non-null object
Populationsinmillions    50 non-null float64
dtypes: float64(2), int64(1), object(2)
memory usage: 2.1+ KB


In [67]:
# Code #4: Renaming a column heading
# 'Unemployment.Rate' contains a '.', which gets in the way of attribute-style
# column access (data_set.<column>), so it is given a dot-free name instead.
data_set.rename(columns={'Unemployment.Rate': 'UnEmpRate'}, inplace=True)

# Check the resulting column names.
list(data_set.columns)


['State', 'Population', 'UnEmpRate', 'Abbreviation', 'Populationsinmillions']

In [36]:
# Code #5: Calculating the mean
# Read from data_set, the frame loaded at the top of the notebook. The bare name
# `data` is never defined here and raises NameError on a fresh kernel.
Population_mean = data_set.Population.mean()
"Population Mean : ", Population_mean
  

('Population Mean : ', 6162876.3)

In [38]:
# Arithmetic mean of the unemployment rate over all rows.
UnEmpRate_mean = data_set["UnEmpRate"].mean()
("UnEmpRate Mean : ", UnEmpRate_mean)

('UnEmpRate Mean : ', 4.066)

In [39]:
# Code #6: Trimmed mean
# Cut the lowest 10% and the highest 10% of the values before averaging,
# which limits the influence of outliers.
Population_TM = trim_mean(data_set["Population"], proportiontocut=0.1)
("Population trim mean : ", Population_TM)

('Population trim mean : ', 4783697.125)

In [41]:
# Trimmed mean of the unemployment rate, cutting 10% from each end of the sorted values.
UnEmpRate_TM = trim_mean(data_set["UnEmpRate"], proportiontocut=0.1)
("UnEmpRate trim mean : ", UnEmpRate_TM)

('UnEmpRate trim mean : ', 3.9450000000000003)

In [42]:
# Code #7: Weighted mean
# Each row's unemployment rate is weighted by that row's Population value, so
# more populous states contribute proportionally more to the average.
UnEmpRate_WM = np.average(
    data_set["UnEmpRate"],
    weights=data_set["Population"],
)
("Weighted UnEmpRate Mean: ", UnEmpRate_WM)


('Weighted UnEmpRate Mean: ', 4.445833981123393)

In [43]:
# Code #8: Median
# Read from data_set, the frame loaded at the top of the notebook. The bare name
# `data` is never defined here and raises NameError on a fresh kernel.
Population_median = data_set.Population.median()
"Population median : ", Population_median
  

Population median :  4436369.5


In [46]:
# Median of the unemployment rate over all rows.
UnEmpRate_median = data_set["UnEmpRate"].median()
("UnEmpRate median : ", UnEmpRate_median)


('UnEmpRate median : ', 4.0)

In [None]:
# Visualizing population (in millions) per state as a bar chart.
# seaborn (sns) and matplotlib.pyplot (plt) are already imported in the first cell,
# so the import is not repeated here.
fig, ax1 = plt.subplots()
fig.set_size_inches(15, 9)

# barplot receives the frame through `data=`; `data_set=` is not one of its keywords.
# `ax=ax1` draws on the axes created above, and the y column is the per-million one
# so that the bar heights match the axis label. Bars keep the original ordering:
# rows sorted by unemployment rate.
sns.barplot(
    x="State",
    y="Populationsinmillions",
    data=data_set.sort_values('UnEmpRate'),
    palette="Set2",
    ax=ax1,
)
ax1.set(xlabel='State', ylabel='Population In Millions')
ax1.set_title('Population in Millions by State', size=20)

# Rotate the state names so they stay readable; the trailing ';' hides the tick repr.
plt.xticks(rotation=-90);

#######################################
