#### Import Statements

In [1]:
import numpy as np
import pandas as pd

#### Loading the Data Files

----------

In [2]:
# Column-oriented sample data: every key becomes a column, every list holds that column's values.
sample_dict = dict(
    first=["john", "jane", "dan"],
    last=["doe", "doe", "james"],
    email=["jd23@gmail.com", "jd21@yahoo.com", "dan543@outlook.com"],
)

df_example = pd.DataFrame(data=sample_dict)

In [3]:
# Working copy of the sample data; later cells sort, rename and extend this frame.
df_dict = pd.DataFrame(data=sample_dict)

In [4]:
# Load the survey results from a CSV file (the path is relative to the current working directory).
df = pd.read_csv("PandasSampleData/survey_results_public.csv")

-------

## Accessing the Data

- Accessing a Single Column

In [5]:
# Bracket access always works. Attribute access (e.g. df_dict.email) is a shortcut that only
# works when the name is a valid identifier and does not clash with an existing DataFrame
# attribute or method.
# NOTE(review): `first` and `last` are DataFrame methods on the pandas versions that still ship
# them, so df_dict.first would not return this column there — confirm for the pinned version.
df_dict["first"];

- Accessing Multiple Columns

In [6]:
# A list of labels inside the brackets selects several columns at once; the result is a DataFrame.
df_dict[["first", "last"]];

- Accessing Data by Locations of Rows and Columns

> Note : Row data is accessed by label with loc or by integer position with iloc; plain brackets with a label (e.g. df["name"]) select a column instead.

> Note : for 'iloc' the slicing is similar to list slicing and the end index is exclusive. But, for 'loc' it's inclusive.

In [7]:
# Position-based selection: rows at positions 0-1 and columns at positions 2-3 (end is exclusive).
df.iloc[0:2, 2:4];

In [8]:
# Label-based selection: index labels 0 through 1 and every column from "Hobbyist" through "Age"
# (both ends of a label slice are included).
df.loc[0:1, "Hobbyist":"Age"];

## Sorting the Data

In [9]:
# Sort by a priority sequence of columns: "last" ascending first, ties broken by "first" descending.
# inplace=True reorders df_dict itself instead of returning a sorted copy.
df_dict.sort_values(by=["last", "first"], ascending=[True, False], inplace=True);
# Passing na_position="first" would place the NaN values ahead of the sorted values.

In [10]:
# Sorting by the index puts the rows back in their original order.
# The call is not in place here: it returns a new frame and df_dict itself stays sorted as before.
df_dict.sort_index();

In [11]:
# .sort_values also works on a Series object (no `by` argument: there is only one set of values).
df_dict["last"].sort_values();

## Data Filtering

In [12]:
# A comparison on a column produces a boolean Series (one True/False per row); passing that
# mask back to the frame keeps only the rows marked True.
filt = df_dict["last"].eq("doe")
filt;

In [13]:
# .loc also accepts a boolean Series as the row selector, and its second argument lets us
# choose which column(s) of the matching rows we want back.
df_dict.loc[filt, "email"];

> Note : Python's built-in "and" / "or" can't be used to combine filter conditions, so we use "&" and "|" instead (with each condition wrapped in parentheses).
To negate a filtering condition, put "~" in front of the condition as we pass it in.

In [14]:
# Rows whose last name is "doe" OR whose first name is "john" ...
is_doe = df_dict["last"].eq("doe")
filt = is_doe | df_dict["first"].eq("john")
# ... and "~" inverts the mask, so this selects the emails of everyone else.
df_dict.loc[~filt, "email"];

> Note : We can also use .where() and .query() method to filter data

> Note : we can also use some of the available str methods i.e., .str.contains(), .str.startswith(), .str.endswith() etc. to filter data.

## Modifying Rows & Columns 

### Modifying Columns

- Renaming Existing Column Names

In [15]:
# Using the ".rename" method to rename particular columns via an {old: new} mapping.
# inplace defaults to False, so this returns a renamed copy and leaves df_dict unchanged.
df_dict.rename(columns={"first": "first name", "last": "last name"});
# Names containing spaces are bad practice: they rule out dot notation for accessing the columns.

In [16]:
# To change the names of all of the columns, assign a full list of names to `.columns`.
# The assignment is positional, so pin the column order first: a DataFrame built from a dict
# follows the dict's insertion order (first, last, email) on current pandas, whereas old
# versions sorted the keys (email, first, last). Without this the labels land on the wrong data.
df_dict = df_dict[["email", "first", "last"]].copy()
df_dict.columns = ["email", "first name", "last name"]
# Changes the column names in place & sequentially according to the list.

- Modifying the existing column names

In [17]:
# Option 1: a list comprehension runs a plain str method over every column label.
df_dict.columns = [label.capitalize() for label in df_dict.columns]
# Option 2: the Index's ".str" accessor applies a string method to all labels at once.
df_dict.columns = df_dict.columns.str.replace(" ", "_")

In [18]:
# Inspect the renamed columns (the trailing ";" suppresses the rendered output).
df_dict;

- Adding New Columns

> Note : the difference between the two methods below is that the first (bracket assignment) always adds the column at the end of the DF, whereas the second (.insert()) can add the column at any position along axis 1 by specifying the index position.

In [19]:
# Assigning a scalar to a new label adds a column at the end, filled with that value in every row.
df_dict["Country"] = "USA"

In [20]:
# insert() places the new column at an explicit position (index 3) instead of at the end.
# Re-running this cell fails because the "Full_name" column then already exists.
df_dict.insert(
    loc=3,
    column="Full_name",
    value=df_dict["First_name"] + " " + df_dict["Last_name"],
)
df_dict;

- Deleting Columns

In [21]:
# Drop both columns by label; inplace=True removes them from df_dict itself.
df_dict.drop(columns=["First_name", "Last_name"], inplace=True)

In [22]:
# Adding back the deleted columns: split "Full_name" on the space; expand=True returns the
# pieces as separate columns, which are assigned to the two new labels.
# NOTE(review): assumes every full name contains exactly one space — confirm for other data.
df_dict[["First_name", "Last_name"]] = df_dict["Full_name"].str.split(" ", expand=True)

In [23]:
# Inspect the frame after the column changes (the trailing ";" suppresses the rendered output).
df_dict;

### Modifying Rows 

- Updating Row Data

In [24]:
# To update a single row of data, assign to .loc[<index label>].
# The values are matched to the columns by position, so the list must follow the frame's
# current column order (Email, Full_name, Country, First_name, Last_name at this point).
df_dict.loc[0] = ["js23@gmail.com", "john smith", "USA", "john", "smith"]

In [25]:
# To update a particular cell, address it as .loc[row_label, column_label].
# A boolean filter can be passed to 'loc' in place of the row label.
df_dict.loc[1, "Last_name"] = "austen"

In [26]:
# Apply a string method to the data of each name column via the ".str" accessor.
for col in ("First_name", "Last_name", "Full_name"):
    capitalized = df_dict[col].str.capitalize()
    df_dict[col] = capitalized

In [27]:
# Inspect the frame after the row updates (the trailing ";" suppresses the rendered output).
df_dict;

- Adding New Rows

In [28]:
# Adding a Single Row
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. Wrapping the row dict in a
# one-row DataFrame and joining it with pd.concat gives the same result on old and new versions.
# ignore_index=True renumbers the rows 0..n-1.
df_dict = pd.concat(
    [
        df_dict,
        pd.DataFrame([{"Email": "maidulhasan956@gmail.com",
                       "Full_name": "Maidul Hasan",
                       "Country": "BD",
                       "First_name": "Maidul",
                       "Last_name": "Hasan"}]),
    ],
    ignore_index=True,
)

In [29]:
# Adding Multiple Rows
# Build a DataFrame from a list of the row dicts and join it to the frame with pd.concat
# (on pandas < 2.0 the list could also be passed directly to the .append method).

- Deleting Rows

In [30]:
# Mask the rows to delete, then hand their index labels to drop().
filt = df_dict["Country"].eq("BD")
rows_to_drop = df_dict[filt].index
df_dict.drop(index=rows_to_drop);  # inplace=False

#### Advanced Row Manipulation Using apply, applymap, map & replace

----------

In [31]:
# Fresh frame built from the sample data for the apply/applymap/map/replace demos.
new_df = pd.DataFrame(data=sample_dict)

-------

- apply

> Note : 'apply' can be used with both Data Frames and Series Objects.

In [32]:
# On a Series, 'apply' calls the function once for every value in the series.
lowered_first = new_df["first"].apply(lambda name: name.lower())
new_df["first"] = lowered_first
# On a DataFrame, 'apply' hands each Series along the chosen axis to the function;
# axis=0 (the default) means one call per column.
new_df.apply(pd.Series.min, axis=0);

- applymap

> Note : 'applymap' can only be used on DataFrames.

In [33]:
# 'applymap' runs a function on every single element of the data frame.
# applymap(str.lower) has the same effect as the lambda below.
# NOTE(review): newer pandas (2.1+) deprecates applymap in favour of DataFrame.map — confirm
# against the pandas version this notebook is pinned to.
new_df.applymap(lambda x: x.lower());

- map

> Note : 'map' only works on a Series object.

In [34]:
# 'map' substitutes each element according to the instruction dictionary; values that have no
# key in the dictionary ("dan" here) become NaN.
new_df["first"].map({"john": "snow", "jane": "chris"});

- replace

> Note : 'replace' can be used on a Series object as well as a Data Frame

In [35]:
# On a Series, 'replace' changes element values according to the instruction dictionary and
# leaves the remaining values unchanged.
new_df["first"].replace({"john": "snow", "jane": "chris"});
# On a DF, 'replace' replaces a certain value with another wherever that value appears in the DF.

## Modifying the Dataframe as a Whole

----

In [36]:
# index_col with two names uses "country" and "year" as a two-level (multi-)index for the rows.
world = pd.read_csv("PandasSampleData/worldstats.csv", index_col=["country", "year"])

In [37]:
# parse_dates asks read_csv to parse the "Date" column into datetime values while loading.
sales = pd.read_csv("PandasSampleData/salesmen.csv", parse_dates=["Date"])

In [38]:
# Food purchase records, used for the pivot_table() demo.
food_sales = pd.read_csv("PandasSampleData/foods.csv")

In [39]:
# Per-salesman quarterly figures in wide format, used for the melt() demo.
quarter_sales = pd.read_csv("PandasSampleData/quarters.csv")

-------

- The stack ( ) method 

> What the stack() method does is, it basically pulls/turns all of the columns in the DF to the last level of a multi-indexed DF's row index.

In [40]:
# Move the column labels (Population, GDP) into a new innermost level of the row index;
# the result is a single column of values, as the output below shows.
world.stack()

country     year            
Arab World  2015  Population    3.920223e+08
                  GDP           2.530102e+12
            2014  Population    3.842226e+08
                  GDP           2.873600e+12
            2013  Population    3.765043e+08
                                    ...     
Zimbabwe    1962  GDP           1.117602e+09
            1961  Population    3.876638e+06
                  GDP           1.096647e+09
            1960  Population    3.752390e+06
                  GDP           1.052990e+09
Length: 22422, dtype: float64

- The unstack ( ) method 

> unstack() does the opposite of stack(). It moves the last (innermost) level of a multi-indexed DF's row index to a column index level.

- The pivot ( ) method

> The pivot() method takes a column as the input of its columns parameter, converts the unique values in that column into new columns, and populates these new columns using the data from the column(s) specified as the input of the values parameter.

> Note that, the columns specified in the values parameter becomes the 1st level of the column multi-index and the newly created columns by the pivot() method becomes the 2nd level of the column multi-index.

In [41]:
# Optional checks before pivoting (left disabled):
# sales.info()
# sales["Salesman"].nunique()
# Store the salesman names with the categorical dtype.
sales["Salesman"] = sales["Salesman"].astype("category")
# One row per Date and one column per Salesman, filled from Revenue. Passing `values` as a
# list keeps "Revenue" as the outer level of the resulting column multi-index.
pivot_sales = sales.pivot(index="Date", columns="Salesman", values=["Revenue"])
pivot_sales.head(3)

Unnamed: 0_level_0,Revenue,Revenue,Revenue,Revenue,Revenue
Salesman,Bob,Dave,Jeb,Oscar,Ronald
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2016-01-01,7172,1864,4430,5250,2639
2016-01-02,6362,8278,8026,8661,4951
2016-01-03,5982,4226,5188,7075,2703


- The pivot_table ( ) method 

> The pivot_table() method works like the pivot() method, except that it takes an extra parameter named aggfunc. The aggregate function passed as aggfunc is applied to the column(s) specified in the values parameter, so rows that share the same index/column combination are combined into a single value.

In [42]:
# Peek at the first three rows (the trailing ";" suppresses the rendered output).
food_sales.head(3);

In [43]:
# Average "Spend" for every (Gender, City) row group and every "Item" column;
# aggfunc="mean" combines all rows that fall into the same cell.
food_sales.pivot_table(index=["Gender", "City"], columns="Item", values="Spend", aggfunc="mean");

- The melt ( ) method 

> It's the opposite of the pivot() method. It unpivots a DataFrame from wide format to long format.

In [44]:
# Wide -> long: the four quarter columns collapse into a "Quarter" label column plus a
# "Revenue" value column, while "Salesman" is kept as the identifier.
quarter_sales.melt(
    id_vars=["Salesman"],
    value_vars=["Q1", "Q2", "Q3", "Q4"],
    var_name="Quarter",
    value_name="Revenue",
);

## Cleaning the Data - Handling Missing Values & Casting Data types

### Handling Missing Values

In [45]:
# Add a row with a custom missing-value marker ("N/A") and no "Country" at all, so that the
# cells below have both kinds of missing data to work with.
# DataFrame.append was removed in pandas 2.0; pd.concat on a DataFrame built from the list of
# row dicts gives the same result. Columns absent from the new row are filled with NaN.
df_dict = pd.concat(
    [
        df_dict,
        pd.DataFrame(
            [{"First_name": "Harry", "Last_name": "Kane", "Email": "N/A", "Full_name": "Harry Kane"}]
        ),
    ],
    ignore_index=True,
)
df_dict;

> Note : .dropna() removes rows/columns that have NaN/None values according to the given conditions, but it can't remove rows/columns that have custom missing-value markers, i.e. strings like 'NA' or 'Missing'. To handle such data we can first replace those strings with NaN using the '.replace' method. Alternatively, if the data is read from a .csv file, we can pass in a list of the custom markers that should be treated as NaN values while loading the .csv file as a DF, i.e.

> df = pd.read_csv("file path", na_values=['NA', 'N/A', 'Missing'])

- Drop rows/columns with NaN/None values

In [46]:
# Drop every row (axis="index") in which at least one (how="any") of the subset columns is NaN/None.
# Not in place (inplace=False): the cleaned frame is returned and df_dict is left untouched.
df_dict.dropna(axis="index", how="any", subset=["Country", "Email"]);

> Note : Default is set to (axis="index", how="any"). To remove columns set axis="columns".

> how="any" removes rows/columns when at least one of the <col_names> in the subset parameter has a NaN/None value, and how="all" removes rows/columns only when all of the <col_names> in the subset parameter have NaN/None values.

- Drop rows/columns with custom missing values

In [47]:
# Turn the custom "N/A" marker into a real NaN (in place) so that dropna() can recognise it.
df_dict.replace("N/A", np.nan, inplace=True)

In [48]:
# Remove, in place, the rows whose "Email" is NaN (including the former "N/A" entries).
df_dict.dropna(subset=["Email"], inplace=True)
df_dict;

> Note : To replace the NaN values of the entire DF we can use either the .replace() or the .fillna() method. To see a mask (a True/False DF) of which elements in a DataFrame are treated as NaN values, use the .isna() method.

### Casting Data-types

In [49]:
# Cast the column to strings. `np.str` was only an alias of the builtin `str`; it was
# deprecated in NumPy 1.20 and removed in 1.24, so use the builtin directly.
# astype() returns a new Series; df itself is not modified.
df["Age"].astype(str);