## Introduction to Data Analysis

In [1]:
import pandas as pd # importing a module using python 

In [2]:
titanic_data = pd.read_csv('train.csv') # reading a csv file using pandas

In [3]:
titanic_data.head() # displaying the first 5 rows of the dataset

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
titanic_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
titanic_data_survived = titanic_data[titanic_data['Survived']==1]

In [6]:
titanic_data_survived.head() 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [7]:
titanic_data_survived['Sex'].value_counts()

female    233
male      109
Name: Sex, dtype: int64

In [8]:
titanic_data_survived['Age'].mean()

28.343689655172415

In [9]:
titanic_data_survived['Fare'].max()

512.3292

In [10]:
titanic_data_survived['Pclass'].value_counts() 

1    136
3    119
2     87
Name: Pclass, dtype: int64

In [11]:
titanic_data_survived.Age.max() 

80.0

## Using functions to manipulate dataframes

In [12]:
def concat_df(train_data, test_data):
    """ Return a concatenated dataframe of train and test """
    return pd.concat([train_data, test_data], sort=True).reset_index(drop=True)

In [13]:
def divide_df(all_data):
    """ Returns a divided dataframe of training and test data """
    return all_data.loc[:890], all_data.loc[891:].drop(['Survived'], axis=1)

In [14]:
train_data = pd.read_csv('train.csv')
train_data.shape

(891, 12)

In [15]:
test_data = pd.read_csv('test.csv')
test_data.shape

(418, 11)

In [16]:
data_all = concat_df(train_data, test_data)

In [17]:
data_all.shape

(1309, 12)

In [18]:
data_all.tail()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
1304,,,S,8.05,"Spector, Mr. Woolf",0,1305,3,male,0,,A.5. 3236
1305,39.0,C105,C,108.9,"Oliva y Ocana, Dona. Fermina",0,1306,1,female,0,,PC 17758
1306,38.5,,S,7.25,"Saether, Mr. Simon Sivertsen",0,1307,3,male,0,,SOTON/O.Q. 3101262
1307,,,S,8.05,"Ware, Mr. Frederick",0,1308,3,male,0,,359309
1308,,,C,22.3583,"Peter, Master. Michael J",1,1309,3,male,1,,2668


In [19]:
train_data.name = 'Training Set'
test_data.name = 'Test Set'
data_all.name = 'All Set' 

df_s = [train_data, test_data]

#print('Number of Training Examples = {}'.format(train_data.shape[0]))
print(f"Number of Training Examples = {train_data.shape[0]}")
print('Number of Test Examples = {}\n'.format(test_data.shape[0]))
print('Training X Shape = {}'.format(train_data.shape))
print('Training y Shape = {}\n'.format(train_data['Survived'].shape[0]))
print('Test X Shape = {}'.format(test_data.shape))
print('Test y Shape = {}\n'.format(test_data.shape[0]))
print(train_data.columns)
print(test_data.columns)

Number of Training Examples = 891
Number of Test Examples = 418

Training X Shape = (891, 12)
Training y Shape = 891

Test X Shape = (418, 11)
Test y Shape = 418

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


## Dealing with Missing Values

In [20]:
train_data.isna().head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False
5,False,False,False,False,False,True,False,False,False,False,True,False
6,False,False,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,True,False
8,False,False,False,False,False,False,False,False,False,False,True,False
9,False,False,False,False,False,False,False,False,False,False,True,False


In [21]:
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [23]:
def display_missing(df):    
    for col in df.columns.tolist():          
        #print('{} column missing values: {}'.format(col, df[col].isnull().sum()))
        print(f"{col} column missing values: {df[col].isnull().sum()}")
    print('\n')
    
for df in df_s:
    print('{}'.format(df.name))
    display_missing(df)

Training Set
PassengerId column missing values: 0
Survived column missing values: 0
Pclass column missing values: 0
Name column missing values: 0
Sex column missing values: 0
Age column missing values: 177
SibSp column missing values: 0
Parch column missing values: 0
Ticket column missing values: 0
Fare column missing values: 0
Cabin column missing values: 687
Embarked column missing values: 2


Test Set
PassengerId column missing values: 0
Pclass column missing values: 0
Name column missing values: 0
Sex column missing values: 0
Age column missing values: 86
SibSp column missing values: 0
Parch column missing values: 0
Ticket column missing values: 0
Fare column missing values: 1
Cabin column missing values: 327
Embarked column missing values: 0




## Something New in Python/Data Analysis

## Sourcing Data

In [24]:
# From Public Repositories: Kaggle, UCI Data Repository 
# From API's 
# Create your own data 

In [25]:
import requests
import json
API_KEY = "AIzaSyBR2ykH-xilkW-4kCA5fXnP8fNZ1owpvJ0" 
TEXT_SEARCH_URL = "https://maps.googleapis.com/maps/api/place/textsearch/json?"
query = "Colab Hub Barnawa"

In [26]:
response = requests.get(TEXT_SEARCH_URL+'query='+query+'&key='+API_KEY) 
json_object = response.json() 

In [27]:
print(json_object["results"][0]["formatted_address"])
print(json_object["results"][0]["name"])

No 4 Barnawa Close, Barnawa 800282, Kaduna, Nigeria
CoLab


In [28]:
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'
list_companies = pd.read_html(url)

In [29]:
len(list_companies)

5

In [30]:
list_companies = list_companies[0]

In [31]:
list_companies.head() 

Unnamed: 0_level_0,Rank,Name,Industry,Revenue,Profit,Employees,Headquarters[note 1],State-owned,Ref.
Unnamed: 0_level_1,Rank,Name,Industry,USD millions,USD millions,Employees,Headquarters[note 1],State-owned,Ref.
0,1,Walmart,Retail,"$572,754","$13,673",2300000,United States,,[1]
1,2,"Amazon.com, Inc.",Retail,"$469,822","$33,364",1608000,United States,,[4]
2,3,State Grid Corporation of China,Electricity,"$460,616.9","$7,137.8",871145,China,,[5]
3,4,China National Petroleum Corporation,Oil and gas,"$411,692.9","$9,637.5",1090345,China,,[6]
4,5,China Petrochemical Corporation,Oil and gas,"$401,313.5","$8,316.1",542286,China,,[7]


## Reading Data - Basics Steps

In [32]:
# Data comes in different shapes & forms

### Terminolgies

In [33]:
# Datasets - Raw data
# DataFrame - 2 Dimensional Data in Tabular Structure
# Series - 1 Dimensional
# Documentation 
# Features - Columns
# Attributes 
# Lambda - Talk about the lambda function

In [34]:
# Steps: Business Case/ Problem: How many people will survive if the Titanic happened again
# Look at the Name column, Split names and create a title column
# Look at the Sex column and compare survivers based on gender 

In [35]:
import pandas as pd # Data analysis

In [36]:
titanic_data = pd.read_csv('train.csv') # The method reads a dataset into a dataframe

In [37]:
type(titanic_data)

pandas.core.frame.DataFrame

In [38]:
titanic_data.shape 

(891, 12)

In [39]:
#pd.set_option('display.max_columns', 8)
titanic_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [40]:
titanic_data.columns 

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [41]:
titanic_data_1 = titanic_data[['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']]

In [42]:
titanic_data_1.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [43]:
titanic_data_1.iloc[:, 2][234]

'Leyson, Mr. Robert William Norman'

In [44]:
titanic_data_2 = titanic_data_1[['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Fare', 'Cabin', 'Embarked']]

In [45]:
titanic_data_2.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')

In [46]:
titanic_data_2.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S


In [47]:
titanic_names = titanic_data['Name'] # Series

In [48]:
type(titanic_names)

pandas.core.series.Series

In [49]:
titanic_names.head(10)

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
5                                     Moran, Mr. James
6                              McCarthy, Mr. Timothy J
7                       Palsson, Master. Gosta Leonard
8    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9                  Nasser, Mrs. Nicholas (Adele Achem)
Name: Name, dtype: object

## Introduction to Numpy

In [None]:
# Numerical Arrays - Efficient storage, Speedy Manipulation
# Powerful Analysis
# Hold different data types - into one data type
# Easy changes & modifications
# Data Generation

In [77]:
import numpy as np 

In [79]:
np.__version__

'1.23.5'

## The Numpy Behavior

In [80]:
age = [23, 45, 65, 21, 69]
age 

[23, 45, 65, 21, 69]

In [82]:
num_trips = [4, 12, 5, 3, 12]
num_trips

[4, 12, 5, 3, 12]

In [85]:
age / num_trips ** 2

TypeError: unsupported operand type(s) for ** or pow(): 'list' and 'int'

In [87]:
sum(num_trips)

36

In [88]:
np_age = np.array(age)
np_numtrips = np.array(num_trips)

In [89]:
np_age

array([23, 45, 65, 21, 69])

In [90]:
type(np_age)

numpy.ndarray

In [91]:
np_age / np_numtrips ** 2

array([1.4375    , 0.3125    , 2.6       , 2.33333333, 0.47916667])

In [92]:
list_obj = ['Pablo', 35, True]

In [93]:
type(list_obj)

list

In [94]:
listobj_array = np.array(list_obj)

In [95]:
listobj_array

array(['Pablo', '35', 'True'], dtype='<U21')

In [108]:
np_age[1]

45

In [110]:
np.random.random((2, 4))

array([[0.6955871 , 0.90731077, 0.85275916, 0.30509072],
       [0.54230437, 0.93817328, 0.16535606, 0.86930888]])

In [111]:
np.arange(1, 15)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [114]:
height = np.random.normal(1.75, 0.20, 500)

In [116]:
height.mean()

1.7602498447050592

## Reading Datasets

In [50]:
# Read the abalone dataset into dataFrame
abalone = pd.read_csv('abalone.data', header=None)

In [51]:
abalone.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9


In [52]:
abalone.columns = [
    'Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
    'Viscera weight', 'Shell weight', 'Rings'
]

In [53]:
abalone.head(3)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9


In [54]:
# Your data could be stored as links
url_covid_data = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/cases_deaths/full_data.csv'

In [55]:
covid_data = pd.read_csv(url_covid_data)

In [56]:
covid_data.shape 

(297659, 10)

In [57]:
covid_data.head() 

Unnamed: 0,date,location,new_cases,new_deaths,total_cases,total_deaths,weekly_cases,weekly_deaths,biweekly_cases,biweekly_deaths
0,2020-01-03,Afghanistan,0.0,0.0,,,,,,
1,2020-01-04,Afghanistan,0.0,0.0,,,,,,
2,2020-01-05,Afghanistan,0.0,0.0,,,,,,
3,2020-01-06,Afghanistan,0.0,0.0,,,,,,
4,2020-01-07,Afghanistan,0.0,0.0,,,,,,


### Understanding Philosophies 

In [58]:
# What is Pandas - Tool for data analysis
# Pandas can read a lot of formats
# You can export DataFrame to csv
# With Pandas, you can convert data into dataframe or a series 
# It helps manipulate data easily 
# Pandas has two Data Structures: DataFrame (2D) & Series(1D)
# Lambda is an anonymous function, it can be applied in one line of code

In [59]:
titanic.head()

NameError: name 'titanic' is not defined

In [None]:
titanic['Sex'].head() # Indexed[], Series is created 

In [None]:
age = pd.Series([23,41,45,56,67])

In [None]:
type(age)

In [None]:
age

In [None]:
# Data types - Statistics
# Quantitative - numbers/floats
# Qualitative - string aka categorical data type (Ordinal or Norminal)


# Data - Categorical - Ordinal - Has order of importance - Org hierarchy 
# Data - Categorical - Norminla - No order of importance - Race

In [None]:
#dir(age)

In [None]:
titanic_data.columns

In [None]:
age_series = titanic_data['Age']

In [None]:
len(dir(age_series))

In [None]:
# .__add__ two underscores/ double underscores/ dunder methods

In [None]:
dir(age_series)

In [None]:
age_series.isin([23.0])

In [None]:
(2) .__add__ (4)

In [None]:
2 + 4 

### Our Own Data

In [60]:
# Learn how to read data formats that are unusual, clean data, transform data...

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import mailbox # for reading mail data

In [3]:
mboxfile = "All mail Including Spam and Trash.mbox" # Variable holding the gmail data
mails = mailbox.mbox(mboxfile) # mbox = variable 
mails 

<mailbox.mbox at 0x7f91d80c7070>

In [4]:
type(mails)

mailbox.mbox

In [5]:
# Output the keys in the mbox
# Use a for loop 
for key in mails[0].keys():
    print(key)

X-GM-THRID
X-Gmail-Labels
Delivered-To
Received
X-Google-Smtp-Source
X-Received
ARC-Seal
ARC-Message-Signature
ARC-Authentication-Results
Return-Path
Received
Received-SPF
Authentication-Results
DKIM-Signature
DKIM-Signature
X-Feedback-Id
X-Mailgun-Sending-Ip
X-Mailgun-Sid
Received
Date
Mime-Version
Content-Type
Subject
From
To
X-Mailgun-Dkim
X-Mailgun-Native-Send
X-Mailgun-Track-Clicks
X-Mailgun-Track-Opens
X-Report-Abuse-To
X-Mailer
X-Mailgun-Variables
Message-Id


### Data Transformation

In [6]:
# Follow steps to transform objects/data to suitable formats

In [7]:
import csv 

In [8]:
with open('mailbox.csv', 'w') as outputfile:
    writer = csv.writer(outputfile)
    writer.writerow(['subject', 'from', 'date', 'to', 'label', 'thread'])
    for message in mails:
        writer.writerow([
            message['subject'], message['from'], message['date'],
            message['to'], message['X-Gmail-Labels'], message['X-GM-THRID']
        ])

In [9]:
dfs = pd.read_csv('mailbox.csv')

In [10]:
dfs.head()

Unnamed: 0,subject,from,date,to,label,thread
0,Notifications & Updates for You: Check if a Py...,Real Python <info@realpython.com>,"Tue, 04 Apr 2023 16:47:15 +0000",peter.okwukogu@gmail.com,"Inbox,Important,Opened,Category Updates",1762264965583084043
1,Brain Food: Next Move Mindset,"""FS (Farnam Street)"" <newsletter@farnamstreetb...","Sun, 09 Apr 2023 09:41:19 +0000 (UTC)",peter.okwukogu@gmail.com,"Inbox,Important,Opened,Category Updates",1762691154010054633
2,"Peter, follow these creators based on your rec...",LinkedIn <messages-noreply@linkedin.com>,"Fri, 7 Apr 2023 11:48:39 +0000 (UTC)",Peter Okwukogu <peter.okwukogu@gmail.com>,"IMAP_$NotJunk,IMAP_NotJunk,Trash,Opened,Catego...",1762517972965411440
3,Promo Promo Promo Fly Air Peace to Mumbai (India),"""Air Peace Limited"" <flyairpeace@flyairpeace.com>","Wed, 29 Mar 2023 01:50:47 -0700 (PDT)",peter.okwukogu@gmail.com,"IMAP_$NotJunk,IMAP_NotJunk,Important,Trash,Ope...",1761691420758929100
4,Updates to User Privacy Notice,eBay <ebay@info.ebay.com>,"Thu, 23 Mar 2023 06:19:44 +0000 (UTC)",ipablo26 <peter.okwukogu@gmail.com>,"IMAP_$NotJunk,IMAP_NotJunk,Important,Trash,Ope...",1761138322890881079


In [70]:
dfs.iloc[0]

subject    Notifications & Updates for You: Check if a Py...
from                       Real Python <info@realpython.com>
date                         Tue, 04 Apr 2023 16:47:15 +0000
to                                  peter.okwukogu@gmail.com
label                Inbox,Important,Opened,Category Updates
thread                                   1762264965583084043
Name: 0, dtype: object

In [11]:
# Concise summary of our DataFrame 
dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4876 entries, 0 to 4875
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  4865 non-null   object
 1   from     4876 non-null   object
 2   date     4876 non-null   object
 3   to       4790 non-null   object
 4   label    4875 non-null   object
 5   thread   4876 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 228.7+ KB


In [12]:
# Convert date feature to a datetime format
dfs['date'] = dfs['date'].apply(lambda x: pd.to_datetime(x,
errors='coerce', utc=True))

In [13]:
dfs.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4876 entries, 0 to 4875
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype              
---  ------   --------------  -----              
 0   subject  4865 non-null   object             
 1   from     4876 non-null   object             
 2   date     4876 non-null   datetime64[ns, UTC]
 3   to       4790 non-null   object             
 4   label    4875 non-null   object             
 5   thread   4876 non-null   int64              
dtypes: datetime64[ns, UTC](1), int64(1), object(4)
memory usage: 228.7+ KB


In [23]:
dfs.iloc[0]

subject    Notifications & Updates for You: Check if a Py...
from                       Real Python <info@realpython.com>
date                               2023-04-04 16:47:15+00:00
to                                  peter.okwukogu@gmail.com
label                Inbox,Important,Opened,Category Updates
thread                                   1762264965583084043
Name: 0, dtype: object

In [24]:
# Convert a dataframe into a dataset
dfs.to_csv('gmail.csv')

## Data Refactoring

In [25]:
dfs['from'].head(10)

0                    Real Python <info@realpython.com>
1    "FS (Farnam Street)" <newsletter@farnamstreetb...
2             LinkedIn <messages-noreply@linkedin.com>
3    "Air Peace Limited" <flyairpeace@flyairpeace.com>
4                            eBay <ebay@info.ebay.com>
5    Google Developer Group - GDG Houston <info@ema...
6       AliExpress <transaction@notice.aliexpress.com>
7                             Meetup <info@meetup.com>
8                             Meetup <info@meetup.com>
9    DSN AI Community <dsnai.community@datasciencen...
Name: from, dtype: object

In [26]:
import re # importing the module regular expressions 

In [27]:
def extract_email_ID(string):
    email = re.findall(r'<(.+?)>', string)
    if not email:
        email = list(filter(lambda y: '@' in y, string.split()))
    return email[0] if email else np.nan

In [79]:
# Real Python <info@realpython.com>

In [80]:
#Return - info@realpython.com

In [81]:
# Anything else - nan

In [28]:
dfs['from'] = dfs['from'].apply(lambda pablo: extract_email_ID(pablo))

In [29]:
dfs['from'].head(10)

0                      info@realpython.com
1          newsletter@farnamstreetblog.com
2            messages-noreply@linkedin.com
3              flyairpeace@flyairpeace.com
4                       ebay@info.ebay.com
5                    info@email.meetup.com
6        transaction@notice.aliexpress.com
7                          info@meetup.com
8                          info@meetup.com
9    dsnai.community@datasciencenigeria.ai
Name: from, dtype: object

In [30]:
dfs.head() 

Unnamed: 0,subject,from,date,to,label,thread
0,Notifications & Updates for You: Check if a Py...,info@realpython.com,2023-04-04 16:47:15+00:00,peter.okwukogu@gmail.com,"Inbox,Important,Opened,Category Updates",1762264965583084043
1,Brain Food: Next Move Mindset,newsletter@farnamstreetblog.com,2023-04-09 09:41:19+00:00,peter.okwukogu@gmail.com,"Inbox,Important,Opened,Category Updates",1762691154010054633
2,"Peter, follow these creators based on your rec...",messages-noreply@linkedin.com,2023-04-07 11:48:39+00:00,Peter Okwukogu <peter.okwukogu@gmail.com>,"IMAP_$NotJunk,IMAP_NotJunk,Trash,Opened,Catego...",1762517972965411440
3,Promo Promo Promo Fly Air Peace to Mumbai (India),flyairpeace@flyairpeace.com,2023-03-29 08:50:47+00:00,peter.okwukogu@gmail.com,"IMAP_$NotJunk,IMAP_NotJunk,Important,Trash,Ope...",1761691420758929100
4,Updates to User Privacy Notice,ebay@info.ebay.com,2023-03-23 06:19:44+00:00,ipablo26 <peter.okwukogu@gmail.com>,"IMAP_$NotJunk,IMAP_NotJunk,Important,Trash,Ope...",1761138322890881079


In [31]:
dfs['label'].head(10)[0]

'Inbox,Important,Opened,Category Updates'

In [32]:
dfs[['label', 'from']].tail(10) 

Unnamed: 0,label,from
4866,"Inbox,Important,Opened,Category Updates",mail@sendfoxmail.com
4867,"Inbox,Important,Opened,Category Personal",susan@altschoolafrica.com
4868,"Archived,Sent",peter.okwukogu@gmail.com
4869,"Inbox,Opened,Category Updates",no-reply@globalaihub.com
4870,"Inbox,Important,Opened,Category Updates",noreply@redditmail.com
4871,"Inbox,Important,Opened,Category Personal",dsn.aicommunity@datasciencenigeria.ai
4872,"Inbox,Opened,Category Promotions",russellpollari@substack.com
4873,"Inbox,Important,Opened,Category Personal",oluwaremi20@gmail.com
4874,"Inbox,Important,Opened,Category Updates",noreply@google.com
4875,"Inbox,Opened,Category Updates",connect@analyticsvidhya.com


In [33]:
myemail = 'peter.okwukogu@gmail.com'
dfs['label'] = dfs['from'].apply(lambda pablo: 'sent' if pablo==myemail else 'inbox')

In [34]:
dfs[['label', 'from']].tail(10) 

Unnamed: 0,label,from
4866,inbox,mail@sendfoxmail.com
4867,inbox,susan@altschoolafrica.com
4868,sent,peter.okwukogu@gmail.com
4869,inbox,no-reply@globalaihub.com
4870,inbox,noreply@redditmail.com
4871,inbox,dsn.aicommunity@datasciencenigeria.ai
4872,inbox,russellpollari@substack.com
4873,inbox,oluwaremi20@gmail.com
4874,inbox,noreply@google.com
4875,inbox,connect@analyticsvidhya.com


In [36]:
dfs.tail(10) 

Unnamed: 0,subject,from,date,to,label,thread
4866,A Townhall different... Bulu balaaaaa =?UTF-8?...,mail@sendfoxmail.com,2023-03-15 20:47:22+00:00,Peter <peter.okwukogu@gmail.com>,inbox,1760468132770839821
4867,Re: Bio,susan@altschoolafrica.com,2023-01-13 10:07:34+00:00,Peter Okwukogu <peter.okwukogu@gmail.com>,inbox,1754835546797901440
4868,Bio,peter.okwukogu@gmail.com,2023-01-12 16:46:24+00:00,Susan Odere <susan@altschoolafrica.com>,sent,1754835546797901440
4869,=?UTF-8?B?R2xvYmFsIEFJIEh1YiBwcmVzZW50cyAiRGln...,no-reply@globalaihub.com,2023-03-05 06:08:52+00:00,peter.okwukogu@gmail.com,inbox,1759506892728670804
4870,New message from u/reddit,noreply@redditmail.com,2021-12-10 03:53:01+00:00,peter.okwukogu@gmail.com,inbox,1718729710969130708
4871,Geospatial AI and Mobility Data for Health Wor...,dsn.aicommunity@datasciencenigeria.ai,2023-02-06 10:18:01+00:00,Chinazo Anebelundu <chinazo@datasciencenigeria...,inbox,1757076453013482311
4872,Dealing with finite time and energy,russellpollari@substack.com,2022-07-11 13:04:00+00:00,peter.okwukogu@gmail.com,inbox,1738061642584533368
4873,"One week ""FREE"" Virtual Assistance",oluwaremi20@gmail.com,2022-08-22 09:42:34+00:00,"""peter.okwukogu@gmail.com"" <peter.okwukogu@gma...",inbox,1741853942654212864
4874,Please complete the post-event report form for...,noreply@google.com,2022-11-19 08:29:48+00:00,"peter.okwukogu@gmail.com, robert.thas.john@gma...",inbox,1749912480724162828
4875,Free Access to Real Life Project | Download No...,connect@analyticsvidhya.com,2022-10-14 08:36:19+00:00,<peter.okwukogu@gmail.com>,inbox,1746651627001081818


In [37]:
dfs.drop(columns='to', inplace=True)

In [38]:
dfs.head(3)

Unnamed: 0,subject,from,date,label,thread
0,Notifications & Updates for You: Check if a Py...,info@realpython.com,2023-04-04 16:47:15+00:00,inbox,1762264965583084043
1,Brain Food: Next Move Mindset,newsletter@farnamstreetblog.com,2023-04-09 09:41:19+00:00,inbox,1762691154010054633
2,"Peter, follow these creators based on your rec...",messages-noreply@linkedin.com,2023-04-07 11:48:39+00:00,inbox,1762517972965411440


### Refactoring the date column

In [40]:
import datetime
import pytz

In [42]:
# list of all timezones
pytz.all_timezones

['Africa/Abidjan', 'Africa/Accra', 'Africa/Addis_Ababa', 'Africa/Algiers', 'Africa/Asmara', 'Africa/Asmera', 'Africa/Bamako', 'Africa/Bangui', 'Africa/Banjul', 'Africa/Bissau', 'Africa/Blantyre', 'Africa/Brazzaville', 'Africa/Bujumbura', 'Africa/Cairo', 'Africa/Casablanca', 'Africa/Ceuta', 'Africa/Conakry', 'Africa/Dakar', 'Africa/Dar_es_Salaam', 'Africa/Djibouti', 'Africa/Douala', 'Africa/El_Aaiun', 'Africa/Freetown', 'Africa/Gaborone', 'Africa/Harare', 'Africa/Johannesburg', 'Africa/Juba', 'Africa/Kampala', 'Africa/Khartoum', 'Africa/Kigali', 'Africa/Kinshasa', 'Africa/Lagos', 'Africa/Libreville', 'Africa/Lome', 'Africa/Luanda', 'Africa/Lubumbashi', 'Africa/Lusaka', 'Africa/Malabo', 'Africa/Maputo', 'Africa/Maseru', 'Africa/Mbabane', 'Africa/Mogadishu', 'Africa/Monrovia', 'Africa/Nairobi', 'Africa/Ndjamena', 'Africa/Niamey', 'Africa/Nouakchott', 'Africa/Ouagadougou', 'Africa/Porto-Novo', 'Africa/Sao_Tome', 'Africa/Timbuktu', 'Africa/Tripoli', 'Africa/Tunis', 'Africa/Windhoek', 'Ameri

In [43]:
def refactor_timezone(x):
    est = pytz.timezone('Africa/Lagos')
    return x.astimezone(est)

In [39]:
dfs.date.head()[0]

Timestamp('2023-04-04 16:47:15+0000', tz='UTC')

In [45]:
dfs['date'] = dfs['date'].apply(lambda x: refactor_timezone(x))

In [47]:
dfs.head(3)

Unnamed: 0,subject,from,date,label,thread
0,Notifications & Updates for You: Check if a Py...,info@realpython.com,2023-04-04 17:47:15+01:00,inbox,1762264965583084043
1,Brain Food: Next Move Mindset,newsletter@farnamstreetblog.com,2023-04-09 10:41:19+01:00,inbox,1762691154010054633
2,"Peter, follow these creators based on your rec...",messages-noreply@linkedin.com,2023-04-07 12:48:39+01:00,inbox,1762517972965411440


In [46]:
dfs.date.head()[0]

Timestamp('2023-04-04 17:47:15+0100', tz='Africa/Lagos')

In [48]:
# Convert to days of the week using the date column
dfs['dayofweek'] = dfs['date'].apply(lambda x: x.day_name())

In [49]:
dfs[['date', 'dayofweek']].head()

Unnamed: 0,date,dayofweek
0,2023-04-04 17:47:15+01:00,Tuesday
1,2023-04-09 10:41:19+01:00,Sunday
2,2023-04-07 12:48:39+01:00,Friday
3,2023-03-29 09:50:47+01:00,Wednesday
4,2023-03-23 07:19:44+01:00,Thursday


In [56]:
dfs[dfs['dayofweek']== 'Monday'].shape

(773, 6)

In [57]:
dfs.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4876 entries, 0 to 4875
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype                       
---  ------     --------------  -----                       
 0   subject    4865 non-null   object                      
 1   from       4876 non-null   object                      
 2   date       4876 non-null   datetime64[ns, Africa/Lagos]
 3   label      4876 non-null   object                      
 4   thread     4876 non-null   int64                       
 5   dayofweek  4876 non-null   object                      
dtypes: datetime64[ns, Africa/Lagos](1), int64(1), object(4)
memory usage: 228.7+ KB


In [58]:
# Convert to a category
dfs['dayofweek'] = dfs.dayofweek.astype('category')

In [59]:
dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4876 entries, 0 to 4875
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype                       
---  ------     --------------  -----                       
 0   subject    4865 non-null   object                      
 1   from       4876 non-null   object                      
 2   date       4876 non-null   datetime64[ns, Africa/Lagos]
 3   label      4876 non-null   object                      
 4   thread     4876 non-null   int64                       
 5   dayofweek  4876 non-null   category                    
dtypes: category(1), datetime64[ns, Africa/Lagos](1), int64(1), object(3)
memory usage: 195.7+ KB


In [60]:
# Refactor for time of day
dfs['timeofday'] = dfs['date'].apply(lambda x: x.hour + x.minute/60 + x.second/3600)

In [61]:
# Refactor for hour
dfs['hour'] = dfs['date'].apply(lambda x: x.hour)

In [62]:
# Refactor for year integer
dfs['year_int'] = dfs['date'].apply(lambda x: x.year)

In [63]:
# Refactor for year fraction
dfs['year'] = dfs['date'].apply(lambda x: x.year + x.dayofyear/365.25)

In [64]:
dfs.iloc[0]

subject      Notifications & Updates for You: Check if a Py...
from                                       info@realpython.com
date                                 2023-04-04 17:47:15+01:00
label                                                    inbox
thread                                     1762264965583084043
dayofweek                                              Tuesday
timeofday                                              17.7875
hour                                                        17
year_int                                                  2023
year                                               2023.257358
Name: 0, dtype: object

In [65]:
dfs.index

RangeIndex(start=0, stop=4876, step=1)

In [66]:
dfs.head()

Unnamed: 0,subject,from,date,label,thread,dayofweek,timeofday,hour,year_int,year
0,Notifications & Updates for You: Check if a Py...,info@realpython.com,2023-04-04 17:47:15+01:00,inbox,1762264965583084043,Tuesday,17.7875,17,2023,2023.257358
1,Brain Food: Next Move Mindset,newsletter@farnamstreetblog.com,2023-04-09 10:41:19+01:00,inbox,1762691154010054633,Sunday,10.688611,10,2023,2023.271047
2,"Peter, follow these creators based on your rec...",messages-noreply@linkedin.com,2023-04-07 12:48:39+01:00,inbox,1762517972965411440,Friday,12.810833,12,2023,2023.265572
3,Promo Promo Promo Fly Air Peace to Mumbai (India),flyairpeace@flyairpeace.com,2023-03-29 09:50:47+01:00,inbox,1761691420758929100,Wednesday,9.846389,9,2023,2023.240931
4,Updates to User Privacy Notice,ebay@info.ebay.com,2023-03-23 07:19:44+01:00,inbox,1761138322890881079,Thursday,7.328889,7,2023,2023.224504


In [67]:
# Set date to index
dfs.index = dfs['date']

In [68]:
dfs.head() 

Unnamed: 0_level_0,subject,from,date,label,thread,dayofweek,timeofday,hour,year_int,year
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-04-04 17:47:15+01:00,Notifications & Updates for You: Check if a Py...,info@realpython.com,2023-04-04 17:47:15+01:00,inbox,1762264965583084043,Tuesday,17.7875,17,2023,2023.257358
2023-04-09 10:41:19+01:00,Brain Food: Next Move Mindset,newsletter@farnamstreetblog.com,2023-04-09 10:41:19+01:00,inbox,1762691154010054633,Sunday,10.688611,10,2023,2023.271047
2023-04-07 12:48:39+01:00,"Peter, follow these creators based on your rec...",messages-noreply@linkedin.com,2023-04-07 12:48:39+01:00,inbox,1762517972965411440,Friday,12.810833,12,2023,2023.265572
2023-03-29 09:50:47+01:00,Promo Promo Promo Fly Air Peace to Mumbai (India),flyairpeace@flyairpeace.com,2023-03-29 09:50:47+01:00,inbox,1761691420758929100,Wednesday,9.846389,9,2023,2023.240931
2023-03-23 07:19:44+01:00,Updates to User Privacy Notice,ebay@info.ebay.com,2023-03-23 07:19:44+01:00,inbox,1761138322890881079,Thursday,7.328889,7,2023,2023.224504


In [76]:
dfs.loc['2023-04-05'].shape

(25, 10)

## Solving Challenges

In [16]:
titanic_data = pd.read_csv('train.csv')

In [17]:
titanic_data.head() 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [18]:
titanic_data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [20]:
titanic_data['Age_str'] = titanic_data['Age'].apply(lambda x: str(x))

In [21]:
titanic_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Age_str'],
      dtype='object')

In [22]:
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  Age_str      891 non-null    object 
dtypes: float64(2), int64(5), object(6)
memory usage: 90.6+ KB


In [120]:
name_titanic = titanic_data['Name']

In [121]:
s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama','hippo'], name='animal')

In [122]:
s 

0      lama
1       cow
2      lama
3    beetle
4      lama
5     hippo
Name: animal, dtype: object

In [123]:
s.isin(['cow', 'lama'])

0     True
1     True
2     True
3    False
4     True
5    False
Name: animal, dtype: bool

In [124]:
name_titanic.isin(['Owen'])

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Name, Length: 891, dtype: bool

In [125]:
name_titanic.head()[0]

'Braund, Mr. Owen Harris'

In [127]:
list_name_titanic = list(name_titanic)

In [129]:
list_name_titanic[0]

'Braund, Mr. Owen Harris'