## Introduction to Data Analysis

In [1]:
import pandas as pd # importing a module using python 

In [2]:
# Load the Titanic training set from the local CSV file into a DataFrame.
titanic_data = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Preview the first five rows of the dataset.
titanic_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# List the column labels available in the DataFrame.
titanic_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
# Keep only the rows whose 'Survived' flag equals 1.
titanic_data_survived = titanic_data.loc[titanic_data['Survived'] == 1]

In [7]:
# Preview the first five rows of the survivors subset.
titanic_data_survived.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [10]:
# Count how many rows of the survivors subset fall into each 'Sex' category.
titanic_data_survived['Sex'].value_counts()

female    233
male      109
Name: Sex, dtype: int64

In [11]:
# Mean of the 'Age' column over the survivors subset.
titanic_data_survived['Age'].mean()

28.343689655172415

In [12]:
# Largest value in the 'Fare' column of the survivors subset.
titanic_data_survived['Fare'].max()

512.3292

In [13]:
# Count how many rows of the survivors subset fall into each 'Pclass' value.
titanic_data_survived['Pclass'].value_counts() 

1    136
3    119
2     87
Name: Pclass, dtype: int64

In [15]:
# Largest value in the 'Age' column of the survivors subset
# (bracket-style column access, as in the neighbouring cells).
titanic_data_survived['Age'].max()

80.0

## Sourcing Data

In [None]:
# From Public Repositories: Kaggle, UCI Data Repository 
# From APIs
# Create your own data 

In [26]:
import requests
import json
import os

# Credentials must never be stored in the notebook itself: read the Google
# Maps key from the environment (export GOOGLE_MAPS_API_KEY before starting
# Jupyter). NOTE(review): the key that used to be hardcoded here has been
# published with the notebook and should be revoked and rotated.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY", "")
if not API_KEY:
    # Warn early so a later rejected request is easy to diagnose.
    print("GOOGLE_MAPS_API_KEY is not set; Places API requests will be rejected.")

# Endpoint of the Places "text search" service and the free-text query to send.
TEXT_SEARCH_URL = "https://maps.googleapis.com/maps/api/place/textsearch/json?"
query = "Colab Hub Barnawa"

In [27]:
# Pass the query string via `params` so requests URL-encodes the values
# (the search text contains spaces) instead of concatenating raw strings.
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(
    TEXT_SEARCH_URL,
    params={"query": query, "key": API_KEY},
    timeout=10,
)
# Fail loudly on HTTP errors rather than trying to parse an error page as JSON.
response.raise_for_status()
json_object = response.json()

In [28]:
# Show the address and name of the first match. Guard against an empty
# result list (no match, or the API rejected the request) so the cell
# reports the problem instead of raising IndexError.
results = json_object.get("results", [])
if results:
    top_result = results[0]
    print(top_result["formatted_address"])
    print(top_result["name"])
else:
    print("No places returned; API status:", json_object.get("status"))

No 4 Barnawa Close, Barnawa 800282, Kaduna, Nigeria
CoLab


In [29]:
# Parse the HTML tables found on the Wikipedia page; the result is a
# collection of tables, not a single DataFrame.
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'
list_companies = pd.read_html(io=url)

In [31]:
# Number of tables that were parsed from the page.
len(list_companies)

5

In [34]:
# Keep only the first parsed table. The name is rebound from the list of
# tables to a single DataFrame, so only index while it is still a list:
# re-running this cell must not try to look up a column labelled 0.
if isinstance(list_companies, list):
    list_companies = list_companies[0]

In [35]:
# Preview the first five rows of the selected table.
list_companies.head(n=5)

Unnamed: 0_level_0,Rank,Name,Industry,Revenue,Profit,Employees,Headquarters[note 1],State-owned,Ref.
Unnamed: 0_level_1,Rank,Name,Industry,USD millions,USD millions,Employees,Headquarters[note 1],State-owned,Ref.
0,1,Walmart,Retail,"$572,754","$13,673",2300000,United States,,[1]
1,2,"Amazon.com, Inc.",Retail,"$469,822","$33,364",1608000,United States,,[4]
2,3,State Grid Corporation of China,Electricity,"$460,616.9","$7,137.8",871145,China,,[5]
3,4,China National Petroleum Corporation,Oil and gas,"$411,692.9","$9,637.5",1090345,China,,[6]
4,5,China Petrochemical Corporation,Oil and gas,"$401,313.5","$8,316.1",542286,China,,[7]


## Reading Data - Basic Steps

In [None]:
# Data comes in different shapes & forms

### Terminologies

In [None]:
# Datasets - Raw data
# DataFrame - 2 Dimensional Data in Tabular Structure
# Series - 1 Dimensional
# Documentation 
# Features - Columns
# Attributes 

In [2]:
import pandas as pd # Data analysis

In [3]:
# read_csv parses the CSV file and returns its contents as a DataFrame.
titanic_data = pd.read_csv(filepath_or_buffer='train.csv')

In [4]:
# Confirm which Python type read_csv handed back.
type(titanic_data)

pandas.core.frame.DataFrame

In [10]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
titanic_data.shape 

(891, 12)

In [9]:
# Cap the number of columns rendered at 8 (the rest collapse into "..."),
# then preview the first ten rows.
pd.options.display.max_columns = 8
titanic_data.head(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,...,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",...,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",...,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",...,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",...,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",...,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",...,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",...,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",...,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",...,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",...,237736,30.0708,,C


In [21]:
# List the column labels available in the DataFrame.
titanic_data.columns 

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [22]:
# Selecting a single column with [] yields a one-dimensional Series.
titanic_names = titanic_data['Name'] # Series

In [23]:
# Confirm the type of the single-column selection.
type(titanic_names)

pandas.core.series.Series

In [24]:
# Preview the first ten passenger names.
titanic_names.head(n=10)

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
5                                     Moran, Mr. James
6                              McCarthy, Mr. Timothy J
7                       Palsson, Master. Gosta Leonard
8    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9                  Nasser, Mrs. Nicholas (Adele Achem)
Name: Name, dtype: object

In [13]:
# Load the abalone dataset into a DataFrame. header=None tells pandas not to
# treat the first record as column names; the columns are numbered instead.
abalone = pd.read_csv(filepath_or_buffer='abalone.data', header=None)

In [14]:
# Preview the first three rows (columns are still numbered at this point).
abalone.head(n=3)

Unnamed: 0,0,1,2,3,...,5,6,7,8
0,M,0.455,0.365,0.095,...,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,...,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,...,0.2565,0.1415,0.21,9


In [19]:
# Attach descriptive names to the nine positional columns, in file order.
abalone.columns = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Whole weight',
    'Shucked weight',
    'Viscera weight',
    'Shell weight',
    'Rings',
]

In [20]:
# Preview the first three rows again, now with the descriptive column names.
abalone.head(n=3)

Unnamed: 0,Sex,Length,Diameter,Height,...,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,...,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,...,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,...,0.2565,0.1415,0.21,9


In [15]:
# Data does not have to live on disk: it can also be referenced by a URL.
# This one points at a raw CSV file in the owid/covid-19-data GitHub repository.
url_covid_data = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/cases_deaths/full_data.csv'

In [16]:
# read_csv is given the URL directly instead of a local file path.
covid_data = pd.read_csv(url_covid_data)

In [17]:
# Dimensions of the downloaded table as a (rows, columns) tuple.
covid_data.shape 

(295937, 10)

In [18]:
# Preview the first five rows of the COVID table.
covid_data.head(n=5)

Unnamed: 0,date,location,new_cases,new_deaths,...,weekly_cases,weekly_deaths,biweekly_cases,biweekly_deaths
0,2020-01-03,Afghanistan,0.0,0.0,...,,,,
1,2020-01-04,Afghanistan,0.0,0.0,...,,,,
2,2020-01-05,Afghanistan,0.0,0.0,...,,,,
3,2020-01-06,Afghanistan,0.0,0.0,...,,,,
4,2020-01-07,Afghanistan,0.0,0.0,...,,,,


### Understanding Philosophies 

In [None]:
# What is Pandas - Tool for data analysis
# Pandas can read a lot of formats
# You can export DataFrame to csv
# With Pandas, you can convert data into dataframe or a series 
# It helps manipulate data easily 

### Our own Data

In [25]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import mailbox # for reading mail data

In [31]:
mboxfile = "All mail Including Spam and Trash.mbox" # Path of the exported Gmail mbox file
# create=False: mailbox.mbox creates a new, empty mailbox by default when the
# path does not exist, which would hide a typo in the file name. With
# create=False a missing file raises mailbox.NoSuchMailboxError instead.
mbox = mailbox.mbox(mboxfile, create=False)
mbox 

<mailbox.mbox at 0x7f7887d4ed30>

In [32]:
# Confirm the type of the object returned by mailbox.mbox(...).
type(mbox)

mailbox.mbox

In [28]:
# Print the header names (keys) of the first message in the mailbox,
# one per line and in the order the message reports them.
first_message_headers = mbox[0].keys()
for key in first_message_headers:
    print(key)

X-GM-THRID
X-Gmail-Labels
Delivered-To
Received
X-Google-Smtp-Source
X-Received
ARC-Seal
ARC-Message-Signature
ARC-Authentication-Results
Return-Path
Received
Received-SPF
Authentication-Results
DKIM-Signature
DKIM-Signature
X-Feedback-Id
X-Mailgun-Sending-Ip
X-Mailgun-Sid
Received
Date
Mime-Version
Content-Type
Subject
From
To
X-Mailgun-Dkim
X-Mailgun-Native-Send
X-Mailgun-Track-Clicks
X-Mailgun-Track-Opens
X-Report-Abuse-To
X-Mailer
X-Mailgun-Variables
Message-Id


### Data Transformation

In [33]:
# No installation step is needed here: `mailbox` is part of the Python
# standard library and is already imported and used in the cells above.
# `pip install mailbox` would fetch an unrelated third-party distribution
# that merely shares the name, so the command (and its log) was removed.
