<a href="https://colab.research.google.com/github/mojo8585/Intro-To-Pandas-Profiling/blob/main/Data_Science_Article_3_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intro to Pickling




In [None]:
#import the packages

In [40]:
import pandas as pd
import pickle
import numpy as np
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
%matplotlib inline 
sns.set(color_codes=True)

In [None]:
#loading the file

In [2]:
# NOTE(review): `google.colab` is only importable inside Google Colab, so this
# cell will fail in a plain Jupyter kernel. `io` is not used in the visible cells.
import io
from google.colab import files
# Opens an interactive upload prompt; the cell output shows the chosen CSV being
# saved into the working directory. `uploaded` is not referenced again in the
# visible cells (presumably a mapping of file name -> contents; confirm in the Colab docs).
uploaded = files.upload()

Saving world-happiness-report.csv to world-happiness-report.csv


In [None]:
#just taking a look at the file to ensure that everything is loaded properly

In [3]:
# Read the uploaded CSV from the working directory into a DataFrame.
# low_memory=False: see the pandas read_csv documentation; it asks pandas to
# process the file in one pass rather than in internal chunks.
happy_data = pd.read_csv('world-happiness-report.csv', low_memory=False)
# A bare last expression makes the notebook render the frame (the output is
# truncated to the first and last rows).
happy_data

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.724,7.370,0.451,50.80,0.718,0.168,0.882,0.518,0.258
1,Afghanistan,2009,4.402,7.540,0.552,51.20,0.679,0.190,0.850,0.584,0.237
2,Afghanistan,2010,4.758,7.647,0.539,51.60,0.600,0.121,0.707,0.618,0.275
3,Afghanistan,2011,3.832,7.620,0.521,51.92,0.496,0.162,0.731,0.611,0.267
4,Afghanistan,2012,3.783,7.705,0.521,52.24,0.531,0.236,0.776,0.710,0.268
...,...,...,...,...,...,...,...,...,...,...,...
1944,Zimbabwe,2016,3.735,7.984,0.768,54.40,0.733,-0.095,0.724,0.738,0.209
1945,Zimbabwe,2017,3.638,8.016,0.754,55.00,0.753,-0.098,0.751,0.806,0.224
1946,Zimbabwe,2018,3.616,8.049,0.775,55.60,0.763,-0.068,0.844,0.710,0.212
1947,Zimbabwe,2019,2.694,7.950,0.759,56.20,0.632,-0.064,0.831,0.716,0.235


In [None]:
#looking how large the data set is 

In [4]:
happy_data.shape

(1949, 11)

In [None]:
#listing the names of the columns to see if I want to drop any 

In [5]:
happy_data.columns

Index(['Country name', 'year', 'Life Ladder', 'Log GDP per capita',
       'Social support', 'Healthy life expectancy at birth',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Positive affect', 'Negative affect'],
      dtype='object')

In [None]:
#Looking at the information about my data

In [7]:
happy_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1949 entries, 0 to 1948
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      1949 non-null   object 
 1   year                              1949 non-null   int64  
 2   Life Ladder                       1949 non-null   float64
 3   Log GDP per capita                1913 non-null   float64
 4   Social support                    1936 non-null   float64
 5   Healthy life expectancy at birth  1894 non-null   float64
 6   Freedom to make life choices      1917 non-null   float64
 7   Generosity                        1860 non-null   float64
 8   Perceptions of corruption         1839 non-null   float64
 9   Positive affect                   1927 non-null   float64
 10  Negative affect                   1933 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 167.6+ KB


In [None]:
# The non-null counts are not the same for every column, so let's look deeper

In [8]:
happy_data.count()

Country name                        1949
year                                1949
Life Ladder                         1949
Log GDP per capita                  1913
Social support                      1936
Healthy life expectancy at birth    1894
Freedom to make life choices        1917
Generosity                          1860
Perceptions of corruption           1839
Positive affect                     1927
Negative affect                     1933
dtype: int64

In [None]:
# Let's check whether null values explain the differing counts; if so, we should drop those rows!

In [11]:
print(happy_data.isnull().sum())

Country name                          0
year                                  0
Life Ladder                           0
Log GDP per capita                   36
Social support                       13
Healthy life expectancy at birth     55
Freedom to make life choices         32
Generosity                           89
Perceptions of corruption           110
Positive affect                      22
Negative affect                      16
dtype: int64


In [None]:
# Drop the rows containing null values; the counts below confirm that every column is now complete, so we are ready to proceed

In [12]:
# dropna() with no arguments removes every row that has a missing value in any
# column; per the outputs shown, 1708 of the original 1949 rows remain.
# Note: this rebinds happy_data, so the unfiltered frame can only be recovered
# by re-reading the CSV.
happy_data = happy_data.dropna() 
# Every column should now report the same non-null count.
happy_data.count()

Country name                        1708
year                                1708
Life Ladder                         1708
Log GDP per capita                  1708
Social support                      1708
Healthy life expectancy at birth    1708
Freedom to make life choices        1708
Generosity                          1708
Perceptions of corruption           1708
Positive affect                     1708
Negative affect                     1708
dtype: int64

In [None]:
#just double checking everything

In [13]:
print(happy_data.isnull().sum()) 

Country name                        0
year                                0
Life Ladder                         0
Log GDP per capita                  0
Social support                      0
Healthy life expectancy at birth    0
Freedom to make life choices        0
Generosity                          0
Perceptions of corruption           0
Positive affect                     0
Negative affect                     0
dtype: int64


Now we are prepped to make changes for analysis!

In [None]:
#changing the data to categorical for cluster analysis 

In [14]:
# Select the columns explicitly and cast each one (identifiers and
# measurements alike) to pandas' categorical dtype.
happy_cats = happy_data[
    [
        'Country name',
        'year',
        'Life Ladder',
        'Log GDP per capita',
        'Social support',
        'Healthy life expectancy at birth',
        'Freedom to make life choices',
        'Generosity',
        'Perceptions of corruption',
        'Positive affect',
        'Negative affect',
    ]
].astype('category')

# Confirm that every column now reports the `category` dtype.
happy_cats.dtypes

Country name                        category
year                                category
Life Ladder                         category
Log GDP per capita                  category
Social support                      category
Healthy life expectancy at birth    category
Freedom to make life choices        category
Generosity                          category
Perceptions of corruption           category
Positive affect                     category
Negative affect                     category
dtype: object

In [15]:
# Keep only the categorical columns (select_dtypes returns a new frame).
happy_cat_cols = happy_cats.select_dtypes('category')

# Swap every category for its integer code. Iterating a DataFrame yields its
# column labels, so the result has one code column per categorical column and
# shares the row index of happy_cats.
happy_cat_codes = pd.DataFrame(
    {label: happy_cats[label].cat.codes for label in happy_cat_cols}
)

# Sanity check: same shape as the input, now with integer dtypes.
print(happy_cat_codes.shape)
print(happy_cat_codes.dtypes)

(1708, 11)
Country name                        int16
year                                 int8
Life Ladder                         int16
Log GDP per capita                  int16
Social support                      int16
Healthy life expectancy at birth    int16
Freedom to make life choices        int16
Generosity                          int16
Perceptions of corruption           int16
Positive affect                     int16
Negative affect                     int16
dtype: object


In [16]:
happy_cat_codes.head()

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,0,3,88,85,16,117,275,434,461,47,143
1,0,4,292,123,58,123,237,447,429,106,122
2,0,5,428,162,52,133,163,392,293,140,160
3,0,6,116,152,40,139,78,428,313,133,152
4,0,7,101,181,40,145,106,476,356,232,153


In [18]:
from sklearn.cluster import KMeans

#fitting the model
# random_state pins the centroid initialisation so re-running the notebook
# yields the same clustering; n_init is spelled out because its default
# differs between scikit-learn releases.
# NOTE(review): the inputs are category codes (including 'Country name' and
# 'year') and are not scaled -- confirm that this is the intended feature set.
kmeans5 = KMeans(n_clusters=5, n_init=10, random_state=42).fit(happy_cat_codes)

#adding new columns to the dataframe
# labels_ holds one cluster id per fitted row, in the row order of
# happy_cat_codes, which was built from happy_cats.
happy_cats['kmeans5'] = kmeans5.labels_
happy_cats.head()

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,kmeans5
0,Afghanistan,2008,3.724,7.37,0.451,50.8,0.718,0.168,0.882,0.518,0.258,3
1,Afghanistan,2009,4.402,7.54,0.552,51.2,0.679,0.19,0.85,0.584,0.237,3
2,Afghanistan,2010,4.758,7.647,0.539,51.6,0.6,0.121,0.707,0.618,0.275,3
3,Afghanistan,2011,3.832,7.62,0.521,51.92,0.496,0.162,0.731,0.611,0.267,3
4,Afghanistan,2012,3.783,7.705,0.521,52.24,0.531,0.236,0.776,0.71,0.268,3


Time to Pickle!

In [23]:
#name of the file we wish to be exported
filename = 'happy_data.pckl'

#Opening the output file: 'w' means writing and 'b' stands for binary.
#The handle stays open until it is closed in a later cell.
outfile = open(filename, 'wb')

In [25]:
#specifying the data that we want to pickle; using the open file as a context
#manager closes it when the block ends, even if pickle.dump raises an error
with outfile:
    pickle.dump(happy_data, outfile)

In [28]:
#I am going to repeat the process for all manipulated data
#(same steps as before: pick a file name, then open it for binary writing;
#`outfile` is rebound to the new handle)
filename_1 = 'happy_cat_codes.pckl'
outfile = open(filename_1, 'wb')

In [29]:
#pickle the integer-coded frame; the with-block closes the file afterwards,
#even if pickle.dump raises an error
with outfile:
    pickle.dump(happy_cat_codes, outfile)

In [30]:
#output file for the frame of categorical columns, opened for binary writing
filename_2 = 'happy_cat_cols.pckl'
outfile = open(filename_2, 'wb')

In [31]:
#pickle the frame of categorical columns; the with-block closes the file
#afterwards, even if pickle.dump raises an error
with outfile:
    pickle.dump(happy_cat_cols, outfile)

In [32]:
#output file for the categorical frame, opened for binary writing
filename_3 = 'happy_cats.pckl'
outfile = open(filename_3, 'wb')

In [33]:
#pickle the categorical frame; the with-block closes the file afterwards,
#even if pickle.dump raises an error
with outfile:
    pickle.dump(happy_cats, outfile)

In [None]:
#Checking the directory to see if every pickle file has been written

In [34]:
import os
os.listdir('.')


['.config',
 'happy_data.pckl',
 'happy_cat_codes.pckl',
 'happy_cat_cols.pckl',
 'world-happiness-report.csv',
 'happy_cats.pckl',
 'sample_data']

In [37]:
#now reading everything back in to show how that works
#NOTE: pickle.load can execute arbitrary code, so only unpickle files that you
#created yourself or otherwise trust
#the with-block closes the file even if unpickling raises an error
with open('happy_data.pckl', 'rb') as infile:
    new_happy = pickle.load(infile)

In [None]:
#renamed just to check 

In [38]:
print(new_happy)

     Country name  year  ...  Positive affect  Negative affect
0     Afghanistan  2008  ...            0.518            0.258
1     Afghanistan  2009  ...            0.584            0.237
2     Afghanistan  2010  ...            0.618            0.275
3     Afghanistan  2011  ...            0.611            0.267
4     Afghanistan  2012  ...            0.710            0.268
...           ...   ...  ...              ...              ...
1944     Zimbabwe  2016  ...            0.738            0.209
1945     Zimbabwe  2017  ...            0.806            0.224
1946     Zimbabwe  2018  ...            0.710            0.212
1947     Zimbabwe  2019  ...            0.716            0.235
1948     Zimbabwe  2020  ...            0.703            0.346

[1708 rows x 11 columns]


In [39]:
new_happy.head()

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.724,7.37,0.451,50.8,0.718,0.168,0.882,0.518,0.258
1,Afghanistan,2009,4.402,7.54,0.552,51.2,0.679,0.19,0.85,0.584,0.237
2,Afghanistan,2010,4.758,7.647,0.539,51.6,0.6,0.121,0.707,0.618,0.275
3,Afghanistan,2011,3.832,7.62,0.521,51.92,0.496,0.162,0.731,0.611,0.267
4,Afghanistan,2012,3.783,7.705,0.521,52.24,0.531,0.236,0.776,0.71,0.268


There we are! Everything is all set, and this data can be loaded and used again!