### Data Wrangling
##### Rome Business School

In [1]:
#discovery — gain a better understanding
#structuring — organize data for analysis and comouting
#cleaning — detect format consistency, removing outliers, formatting null values
#enriching — determining whether additional data wold benefit the dataset
#validating — assure data consistency, quality and accuracy
#publishing — prepare the data for publishing

In [2]:
#Pandas DataFrame is used
#2 dimensional labeled data structures with columns, i.e. spreadsheet


In [15]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [40]:
#import pandas

import pandas as pd

In [41]:
#create dataframe

dataframe = pd.DataFrame()

In [42]:
dataframe["Name"] = ['Oliver','Jackson','Eter']
dataframe["Age"] = [12, 15, 18]
dataframe['Driver'] = [True, False, True]

dataframe

Unnamed: 0,Name,Age,Driver
0,Oliver,12,True
1,Jackson,15,False
2,Eter,18,True


In [45]:
#create row with pd Series

new_person = pd.Series(['Jack', 34, True], index=['Name','Age', 'Driver'])
new_person

Name      Jack
Age         34
Driver    True
dtype: object

In [50]:
#Put the 2 together

newdataframe = dataframe.append(new_person, ignore_index=True)
newdataframe

Unnamed: 0,Name,Age,Driver
0,Oliver,12,True
1,Jackson,15,False
2,Eter,18,True
3,Jack,34,True


In [56]:
#load .csv files  dataframe = pd.read.csv(url)

url = 'https://raw.githubusercontent.com/chrisalbon/sim_data/master/titanic.csv'
titanic = pd.read_csv(url)

In [58]:
# to see the top 5 of DF

titanic.head(5)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0


In [60]:
# Previewing the DF

titanic.shape

(1313, 6)

In [62]:
#index is an integer indicating the the row position in the DF

newdataframe.iloc[2]

Name      Eter
Age         18
Driver    True
Name: 2, dtype: object

In [64]:
#selecting a few

newdataframe.iloc[1:3]

Unnamed: 0,Name,Age,Driver
1,Jackson,15,False
2,Eter,18,True


In [69]:
#show top rows where driver is True

newdataframe[newdataframe['Driver'] == 'True'].head

<bound method NDFrame.head of Empty DataFrame
Columns: [Name, Age, Driver]
Index: []>

In [68]:
newdataframe

Unnamed: 0,Name,Age,Driver
0,Oliver,12,True
1,Jackson,15,False
2,Eter,18,True
3,Jack,34,True


In [78]:
titanic[titanic['Sex'] == 'female'].head(5)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
6,"Andrews, Miss Kornelia Theodosia",1st,63.0,female,1,1
8,"Appleton, Mrs Edward Dale (Charlotte Lamson)",1st,58.0,female,1,1


In [80]:
# Changing the index

newdataframe = newdataframe.set_index(newdataframe['Name'])
newdataframe

Unnamed: 0_level_0,Name,Age,Driver
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Oliver,Oliver,12,True
Jackson,Jackson,15,False
Eter,Eter,18,True
Jack,Jack,34,True


In [81]:
# now we can easily find a person

newdataframe.loc['Oliver']

Name      Oliver
Age           12
Driver      True
Name: Oliver, dtype: object

In [94]:
#Renaming Columns

newdataframe.rename(columns={'Age': 'Year'}).head(5)

Unnamed: 0_level_0,Name,Year,Driver
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Oliver,Oliver,12,True
Jackson,Jackson,15,False
Eter,Eter,18,True
Jack,Jack,34,True


In [99]:
#Statistics

print(newdataframe['Age'])

Name
Oliver     12
Jackson    15
Eter       18
Jack       34
Name: Age, dtype: int64


In [103]:
#Looking different type of defined values

newdataframe['Driver'].unique()

array([ True, False])

In [105]:
#Findig missing values
#.isnull()

In [106]:
#remove duplicates
#.drop_duplicates()

In [107]:
titanic.drop_duplicates()

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.00,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.00,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.00,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.00,female,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0
...,...,...,...,...,...,...
1308,"Zakarian, Mr Artun",3rd,27.00,male,0,0
1309,"Zakarian, Mr Maprieder",3rd,26.00,male,0,0
1310,"Zenni, Mr Philip",3rd,22.00,male,0,0
1311,"Lievens, Mr Rene",3rd,24.00,male,0,0


In [108]:
titanic.shape

(1313, 6)

In [109]:
#grouping rows by values
#.groupby().mean()

In [110]:
titanic.groupby('Sex').mean()

Unnamed: 0_level_0,Age,Survived,SexCode
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,29.396424,0.666667,1.0
male,31.014338,0.166863,0.0


In [111]:
#Lambda Functions
# Simple, small function containing a single expression

In [116]:
titanic.groupby('Sex').apply(lambda x: x.count())

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,462,462,288,462,462,462
male,851,851,468,851,851,851


### Exploratory Data Analysis
refers to the critical proess of performing investigations on data

In [118]:
pip install kaggle

Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
Collecting python-slugify
  Downloading python-slugify-4.0.1.tar.gz (11 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
Building wheels for collected packages: kaggle, python-slugify
  Building wheel for kaggle (setup.py): started
  Building wheel for kaggle (setup.py): finished with status 'done'
  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73057 sha256=cc983ab85097e5861405aa7d8f1a5c478669a6d07c3ff213ba267d86dc22c0eb
  Stored in directory: c:\users\kisszabo\appdata\local\pip\cache\wheels\29\da\11\144cc25aebdaeb4931b231e25fd34b394e6a5725cbb2f50106
  Building wheel for python-slugify (setup.py): started
  Building wheel for python-slugify (setup.py): finished with status 'done'
  Created wheel for python-slugify: filename=python_slugify-4.0.1-py2.py3-none-any.whl size=6772 sha256=2fd497b518001f2f8141af7026439ddc00e32b3d0eaf64721431d16e81b3c845
  Store

In [130]:
pwd

'C:\\Users\\KISSZABO'