## Data Wrangling: Clean, Transform, Merge, Reshape 
##### - Much of the programming work in data analysis and modeling is spent in data preparation.
##### - That is, data loading, cleaning, transforming, and rearranging.
##### - This is discussed and demonstrated below.

In [1]:
# Importing the necessary libraries 

import pandas as pd 
import numpy as np 

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Performing DataFrame Merges

In [2]:
# Creating Pandas DataFrames 

df1 = pd.DataFrame(
    np.arange(12).reshape(4,3),
    index=list("abcd")
)

df2 = pd.DataFrame(
    np.arange(9).reshape(3,3),
    index=list("abc")
)

In [3]:
# In the below operating, the merging will take place on the overallaping column names 
# as the keys.
# It is, however, good practice to specify the column names where merging will take place

pd.merge(df1, df2)

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8


In [4]:
data_1 = pd.DataFrame(
    {
        'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
        "data": range(7)
    }
)

data_2 = pd.DataFrame(
    {
        'key': ['a', 'b', 'd'],
        "data": range(3)
    }
)

data_2

Unnamed: 0,key,data
0,a,0
1,b,1
2,d,2


In [5]:
data_1

Unnamed: 0,key,data
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [6]:
pd.merge(data_1, data_2, on="key")

Unnamed: 0,key,data_x,data_y
0,b,0,1
1,b,1,1
2,a,2,0
3,a,4,0
4,a,5,0
5,b,6,1


In [7]:
# Creating a Population DataFrame 

df1 = pd.DataFrame({
    "Country": ["America", "Indonesia", "France"],
    "Location": ["New York", "Jakarta", "Paris"],
    "Population": [738100, 575030, 183305]
})

df2 = pd.DataFrame({
    "Country": ["America", "America", "Indonesia", "India", "France", "Greece"],
    "Location": ["New York", "Chicago", "Jakarta", "Mumbai", "Paris", "Yunani"],
    "Income": [1000, 1500, 1400, 1100, 900, 1200]
})

df1

Unnamed: 0,Country,Location,Population
0,America,New York,738100
1,Indonesia,Jakarta,575030
2,France,Paris,183305


In [8]:
df2

Unnamed: 0,Country,Location,Income
0,America,New York,1000
1,America,Chicago,1500
2,Indonesia,Jakarta,1400
3,India,Mumbai,1100
4,France,Paris,900
5,Greece,Yunani,1200


In [9]:
# Specifying the merge column as the key

pd.merge(df1, df2, on="Country")

Unnamed: 0,Country,Location_x,Population,Location_y,Income
0,America,New York,738100,New York,1000
1,America,New York,738100,Chicago,1500
2,Indonesia,Jakarta,575030,Jakarta,1400
3,France,Paris,183305,Paris,900


##### - In a case where the merge columns are different in each DataFrame, 
#####   you can specify them as parameters separately

In [10]:
# Specifying the merge columns seperately as paramters 

pd.merge(df1, df2, left_on="Country", right_on="Country")

Unnamed: 0,Country,Location_x,Population,Location_y,Income
0,America,New York,738100,New York,1000
1,America,New York,738100,Chicago,1500
2,Indonesia,Jakarta,575030,Jakarta,1400
3,France,Paris,183305,Paris,900


##### By default, merge will performs an "inner" join. 
##### However, you can specify the type of join that you want to achieve as with databases "outer", "inner", "left", or "right" joins.
##### - Check out the examples below 

In [11]:
# Performing a left join

pd.merge(df1, df2, right_on="Country", left_on="Country", how="left")

Unnamed: 0,Country,Location_x,Population,Location_y,Income
0,America,New York,738100,New York,1000
1,America,New York,738100,Chicago,1500
2,Indonesia,Jakarta,575030,Jakarta,1400
3,France,Paris,183305,Paris,900


In [18]:
# Performing a right join 

results = pd.merge(df1, df2, right_on="Country", left_on="Country", how="right")
results

Unnamed: 0,Country,Location_x,Population,Location_y,Income
0,America,New York,738100.0,New York,1000
1,America,New York,738100.0,Chicago,1500
2,Indonesia,Jakarta,575030.0,Jakarta,1400
3,India,,,Mumbai,1100
4,France,Paris,183305.0,Paris,900
5,Greece,,,Yunani,1200


### Understanding apply(), applymap(), and map()
___________
### 1). apply()
##### This method is defined for both pandas Series and DataFrame.
##### It allows us to apply functions and alter values along a specific axis. The default axis is columns.

In [27]:
# Importing the titanic dataset 

dataset = pd.read_csv("titanic.csv")
dataset.head(5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [30]:
# Applying a function that categorizes the passanges based on thier age.
# Then we create a new column for the same 

# Categorizes passengers according to thier age
def age_categorizer(age: int):
    if age < 18:
        return "Child"
    elif age >= 18 and age <= 35: 
        return "Youth"
    else:
        return "Senior Citizen"

# Creating a new col with age_category as a label
dataset["age_category"] = dataset.Age.apply(age_categorizer)
dataset.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,age_category
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Youth
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Senior Citizen
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Youth
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Youth
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Youth
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Senior Citizen
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Senior Citizen
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,Child
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Youth
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Child


### 2). map()
#### - This method is defined for Pandas Series objects. 
#### - It is used to substitute values in a Series object using a function, dictionary, or another Series object

In [34]:
# Using map() to map gender values to numerical values 
# that is, 0 for male and 1 for female 

def gender_mapping(gender: str): # The function for mapping gender
    return 0 if gender == "male" else 1

dataset["gender_map"] = dataset.Sex.map(gender_mapping)
dataset.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,age_category,gender_map
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Youth,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Senior Citizen,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Youth,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Youth,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Youth,0
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Senior Citizen,0
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Senior Citizen,0
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,Child,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Youth,1
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Child,1


In [35]:
# We can also use a dictionary to map gender values to numerical values 

mapping_dict = {
    "male": 0,
    "female": 1
}

dataset.Sex.map(mapping_dict)

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    1
889    0
890    0
Name: Sex, Length: 891, dtype: int64

## Understanding Grouping and Sorting
##### - Grouping our data is very important as it allows us to perform some operations to these data 