# Data Wrangling

In [57]:
import numpy as np
import pandas as pd
import seaborn as sns

In [58]:
kashti = sns.load_dataset('titanic')
ks1 = kashti
ks2 = kashti

In [59]:
kashti.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [60]:
# simple operations (Math operators)
# you can do - + * / % ** also
(kashti['age']+6).head(10)

0    28.0
1    44.0
2    32.0
3    41.0
4    41.0
5     NaN
6    60.0
7     8.0
8    33.0
9    20.0
Name: age, dtype: float64

## Dealing with missing values

- In data set missing values are either ? or NaN or N/A, or 0 or a blank cell.
- Jab kabhi data na ho kisi aik row main kisi b aik parameter ka

Code:

1. Koshish karen dobara data collect kar len ya dekh len agar kahin ghalti hy
2. Missing value wala variable (column) hi nikaal den agar data pr effect nahi hta ya simple row or data entry remove kar den.
3. Replace the missing value:
    1. How?
        1. Average value of entire variable or similar data point.
        2. Frequency or MODE replacement
        3. Replace based on other functioins (Data sampler knows that)
        4. ML algorithm can also be used
        5. Leave it like that
    2. Why?
        1. Its better because no data is lost
        2. Less accurate
     

In [61]:
# where exactly missing values are?
kashti.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [62]:
kashti.shape

(891, 15)

In [63]:
# use drop.na method
print(kashti.shape)
kashti.dropna(subset=['deck'], axis=0, inplace=True) # subset refers to column, axis is vertical.
# inplace=True means that the changes are made in the original dataframe

(891, 15)


In [64]:
kashti.isnull().sum() # find again missing values

survived        0
pclass          0
sex             0
age            19
sibsp           0
parch           0
fare            0
embarked        2
class           0
who             0
adult_male      0
deck            0
embark_town     2
alive           0
alone           0
dtype: int64

In [65]:
# remove na from whole dataframe
# to drop NA
kashti.dropna()
# to update the main dataframe
kashti = kashti.dropna() # remove NA from main dataframe
kashti.isnull().sum()
# no NA values now.

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

In [66]:
kashti.shape

(182, 15)

In [67]:
ks1.isnull().sum()

survived        0
pclass          0
sex             0
age            19
sibsp           0
parch           0
fare            0
embarked        2
class           0
who             0
adult_male      0
deck            0
embark_town     2
alive           0
alone           0
dtype: int64

# Replacing missing values with the average of that column

In [68]:
# finding an average (mean)
mean = ks1['age'].mean()
mean

35.77945652173913

In [69]:
# replacing nan with mean of the data (updating as well)
ks1['age'] = ks1['age'].replace(np.nan, mean)

In [70]:
ks1.isnull().sum()
# age is replaced

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       2
class          0
who            0
adult_male     0
deck           0
embark_town    2
alive          0
alone          0
dtype: int64

## Data formatting

- Data ko aik common standard pr lana
- Ensures data is consistent and understandable
    - Easy to gather
    - Easy to work with 
        - Convert g to kg or similar unit for all
        - one standard unit in each column
        - ft != cm

In [71]:
# know the data type and convert it into the known one
kashti.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [72]:
# use this method to convert the data type from one to another
# type casting
kashti['survived'] = kashti['survived'].astype("int32")
kashti.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kashti['survived'] = kashti['survived'].astype("int64")


survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [73]:
# here we will convert the age into days insetad of years
ks1['age'] = ks1['age']*365
# change form float to int
ks1['age'] = ks1['age'].astype("int")
ks1.head(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,13870,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,12775,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,19710,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,1460,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,21170,0,0,26.55,S,First,woman,False,C,Southampton,yes,True
21,1,2,male,12410,0,0,13.0,S,Second,man,True,D,Southampton,yes,True
23,1,1,male,10220,0,0,35.5,S,First,man,True,A,Southampton,yes,True
27,0,1,male,6935,3,2,263.0,S,First,man,True,C,Southampton,no,False
31,1,1,female,13059,1,0,146.5208,C,First,woman,False,B,Cherbourg,yes,False
52,1,1,female,17885,1,0,76.7292,C,First,woman,False,D,Cherbourg,yes,False


In [74]:
# always rename afterwards
ks1.rename(columns={'age': 'age in days'}, inplace=True)
ks1.head()

Unnamed: 0,survived,pclass,sex,age in days,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,13870,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,12775,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,19710,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,1460,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,21170,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


### **Data Normalizaion**
- Uniform the data
- Making sure they have same impact
- Ail machli samundar main aik jar main
- Also for computautinal reasons

In [75]:
kashti.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


In [76]:
ks4 = ks1[["age in days", "fare"]]
ks4.head()

Unnamed: 0,age in days,fare
1,13870,71.2833
3,12775,53.1
6,19710,51.8625
10,1460,16.7
11,21170,26.55


- The above data is really in wide range and we need to normalize and hard to compare
- Normalization change the values to the range of 0-to-1 ( now both variables have similar influence on our models)

## Method of Normalization

1. Simple feature scaling
    - x(new)= x(old)/x(max)
2. Min-Max method
3. Z-score (standard score) -3 -to- +3
4. Log transformation

In [77]:
# simple feature scaling
ks4['fare'] = ks4['fare']/ks4['fare'].max()
ks4.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks4['fare'] = ks4['fare']/ks4['fare'].max()


Unnamed: 0,age in days,fare
1,13870,0.139136
3,12775,0.103644
6,19710,0.101229
10,1460,0.032596
11,21170,0.051822


In [78]:
# Min-Max method
ks4['fare'] = (ks4['fare'] - ks4['fare'].min())/(ks4['fare'].max() - ks4['fare'].min())
ks4.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks4['fare'] = (ks4['fare'] - ks4['fare'].min())/(ks4['fare'].max() - ks4['fare'].min())


Unnamed: 0,age in days,fare
1,13870,0.139136
3,12775,0.103644
6,19710,0.101229
10,1460,0.032596
11,21170,0.051822


In [79]:
# Z-score (standard score)
ks4['fare'] = (ks4['fare']-ks4['fare'].mean())/ks4['fare'].std()
ks4.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks4['fare'] = (ks4['fare']-ks4['fare'].mean())/ks4['fare'].std()


Unnamed: 0,age in days,fare
1,13870,-0.067879
3,12775,-0.311883
6,19710,-0.328489
10,1460,-0.800339
11,21170,-0.668161


In [80]:
# log transformation
ks4['fare'] = np.log(ks4['fare'])
ks4.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks4['fare'] = np.log(ks4['fare'])


Unnamed: 0,age in days,fare
1,13870,
3,12775,
6,19710,
10,1460,
11,21170,


# Binning
- Grouping of values into smaller number of values (bins)
- Convert numeric into categories (jawan, bachy, boorhy) or 1-16, 17-30 etc
- To have better understanding of groups
    - low vs mid vs high price

In [81]:
# bins = np.linspace(min(kashti['age']), max(kashti['age']), 15000)
# age_groups = ["Bachy", "Jawan", "Boorhay"]
# kashti['age'] = pd.cut(kashti['age'], bins, labels=age_groups, include_lowest=True)
# kashti['age']
# assignment

### **Converting categoris into dummies**
- easy to use for computation
- Male Female (0, 1)

In [82]:
pd.get_dummies(ks1['sex'])
# assignment
# transfer into dummy values in main dataframe

Unnamed: 0,female,male
1,1,0
3,1,0
6,0,1
10,1,0
11,1,0
...,...,...
871,1,0
872,0,1
879,1,0
887,1,0
