In [27]:
### Creating a DataFrame from a List

In [28]:
import pandas as pd

# A flat Python list becomes a single-column DataFrame; `columns` names that column.
name_list = [1, 2, 3]
name_df = pd.DataFrame(data=name_list, columns=['numbers'])

name_df

Unnamed: 0,numbers
0,1
1,2
2,3


In [29]:
# Dict of equal-length lists: each key becomes a column, `index` supplies the row labels.
name_dict = {
    'names': ['Abidemi', 'Adewale', 'Aduke'],
    'age': [1, 2, 3],
}
name_df = pd.DataFrame(data=name_dict, index=['sdial', 'seas', 'esda'])
name_df

Unnamed: 0,names,age
sdial,Abidemi,1
seas,Adewale,2
esda,Aduke,3


### Retrieving data using columns or sequence

In [30]:
# Label-based lookup: fetch the row whose index label is 'sdial' (comes back as a Series).
name_df.loc['sdial']

names    Abidemi
age            1
Name: sdial, dtype: object

In [31]:
# Data type of each DataFrame column

name_df.dtypes

names    object
age       int64
dtype: object

In [32]:
# Insert row
# The new row is built as a Series keyed by the existing column labels,
# so its values line up with 'names' and 'age'.
name_series = pd.Series(name_list[:2], index=name_df.columns, name=3)

# concat returns a new DataFrame; the result is only displayed here, name_df is not modified.
# ignore_index=True renumbers the rows 0..n-1, discarding the original row labels.
pd.concat([name_df, pd.DataFrame([name_series])], ignore_index=True)

Unnamed: 0,names,age
0,Abidemi,1
1,Adewale,2
2,Aduke,3
3,1,2


In [33]:
# Insert a column
# Assigning to a label that is not yet a column adds it to name_df in place
# (the integer values here are just placeholders).

name_df['city'] = [1, 3, 4]

In [34]:
# Delete column
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because 'city' has already been removed.

name_df.drop(columns=['city'], inplace=True, errors='ignore')
name_df

Unnamed: 0,names,age
sdial,Abidemi,1
seas,Adewale,2
esda,Aduke,3


In [35]:
# Delete row
# drop() returns a new DataFrame without the 'esda' row; the result is only
# displayed, so name_df itself keeps all of its rows.

name_df.drop(labels=['esda'], axis='index')

Unnamed: 0,names,age
sdial,Abidemi,1
seas,Adewale,2


In [36]:
# name_df still contains the 'esda' row: the drop() in the previous cell was not assigned back.
name_df

Unnamed: 0,names,age
sdial,Abidemi,1
seas,Adewale,2
esda,Aduke,3


### Dirty Data & Data Cleaning

The best way to ensure good data quality is through data cleaning.

In [37]:
# Scrape every HTML table on the page; read_html returns them as a list of DataFrames.
# NOTE(review): this needs network access and the live page can change, so the
# outputs below may not reproduce exactly on a later run.
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'
company_data = pd.read_html(io=url)

In [38]:
# Keep only the first table from the list returned by read_html.
# NOTE(review): assumes the first table on the page is the revenue ranking — confirm.
# The isinstance guard makes the cell safe to re-run: once company_data is
# already a DataFrame, indexing it with [0] would raise KeyError.
if isinstance(company_data, list):
    company_data = company_data[0]

In [39]:
# Removing multiple index
# Keep only the top level of the column header so every column has one plain label.
# NOTE(review): any information held in lower header levels is discarded here.
company_data.columns = company_data.columns.get_level_values(0)

In [40]:
# Preview the first rows of the table after flattening the header.
company_data.head()

Unnamed: 0,Rank,Name,Industry,Revenue,Profit,Employees,Headquarters[note 1],State-owned,Ref.
0,1,Walmart,Retail,"$648,125","$15,511",2100000,United States,,[1]
1,2,Amazon,Retail,"$574,785","$30,425",1525000,United States,,[4]
2,3,State Grid Corporation of China,Electricity,"$545,948","$9,204",1361423,China,,[5]
3,4,Saudi Aramco,Oil and gas,"$494,890","$129,699",73311,Saudi Arabia,,[6]
4,5,China Petrochemical Corporation,Oil and gas,"$429,700","$9,393",513434,China,,[7]


In [41]:
# Drop the 'Ref.' column (it only holds citation markers such as "[1]").
# errors='ignore' keeps this cell re-runnable: without it, a second run raises
# KeyError because 'Ref.' has already been removed.
company_data.drop(columns=['Ref.'], inplace=True, errors='ignore')
company_data.head()

Unnamed: 0,Rank,Name,Industry,Revenue,Profit,Employees,Headquarters[note 1],State-owned
0,1,Walmart,Retail,"$648,125","$15,511",2100000,United States,
1,2,Amazon,Retail,"$574,785","$30,425",1525000,United States,
2,3,State Grid Corporation of China,Electricity,"$545,948","$9,204",1361423,China,
3,4,Saudi Aramco,Oil and gas,"$494,890","$129,699",73311,Saudi Arabia,
4,5,China Petrochemical Corporation,Oil and gas,"$429,700","$9,393",513434,China,


In [42]:
# Rename a column
# The mapping is old label -> new label; labels missing from the frame are skipped.
company_data.rename({'Rank': 'Ranking'}, axis='columns', inplace=True)
company_data.head()

Unnamed: 0,Ranking,Name,Industry,Revenue,Profit,Employees,Headquarters[note 1],State-owned
0,1,Walmart,Retail,"$648,125","$15,511",2100000,United States,
1,2,Amazon,Retail,"$574,785","$30,425",1525000,United States,
2,3,State Grid Corporation of China,Electricity,"$545,948","$9,204",1361423,China,
3,4,Saudi Aramco,Oil and gas,"$494,890","$129,699",73311,Saudi Arabia,
4,5,China Petrochemical Corporation,Oil and gas,"$429,700","$9,393",513434,China,


In [43]:
# Check dtypes and non-null counts: Revenue and Profit are still text (object) at this point.
company_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Ranking               50 non-null     int64  
 1   Name                  50 non-null     object 
 2   Industry              50 non-null     object 
 3   Revenue               50 non-null     object 
 4   Profit                50 non-null     object 
 5   Employees             50 non-null     int64  
 6   Headquarters[note 1]  50 non-null     object 
 7   State-owned           0 non-null      float64
dtypes: float64(1), int64(2), object(5)
memory usage: 3.2+ KB


In [44]:
# Convert Revenue from text such as "$648,125" to int64.
# Vectorized string ops replace the per-row Python loop. astype(str) comes first
# so the cell can be re-run after the column is already numeric (the loop form
# failed with AttributeError on ints).
company_data['Revenue'] = (
    company_data['Revenue']
    .astype(str)
    .str.replace(r'[$,]', '', regex=True)
    .astype('int64')
)

In [46]:
# Convert Profit from text such as "$15,511" to int64.
# Vectorized string ops replace the per-row Python loop. astype(str) comes first
# so the cell can be re-run after the column is already numeric (the loop form
# failed with AttributeError on ints).
company_data['Profit'] = (
    company_data['Profit']
    .astype(str)
    .str.replace(r'[$,]', '', regex=True)
    .astype('int64')
)

In [50]:
# Put the currency in the column labels now that the "$" is gone from the values,
# then confirm both columns are numeric.
# NOTE(review): the figures look like USD millions (e.g. Walmart 648,125) — confirm
# against the source table before treating them as plain dollars.
company_data.rename(
    {'Revenue': 'Revenue ($)', 'Profit': 'Profit ($)'},
    axis='columns',
    inplace=True,
)
company_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Ranking               50 non-null     int64  
 1   Name                  50 non-null     object 
 2   Industry              50 non-null     object 
 3   Revenue ($)           50 non-null     int64  
 4   Profit ($)            50 non-null     int64  
 5   Employees             50 non-null     int64  
 6   Headquarters[note 1]  50 non-null     object 
 7   State-owned           0 non-null      float64
dtypes: float64(1), int64(4), object(3)
memory usage: 3.2+ KB


In [51]:
# Cast both money columns from int64 to float, one column at a time.
for money_col in ('Revenue ($)', 'Profit ($)'):
    company_data[money_col] = company_data[money_col].astype('float')

In [53]:
# List the distinct profit values to eyeball the result of the float conversion.
company_data['Profit ($)'].unique()

array([ 15511.,  30425.,   9204., 129699.,   9393.,  21294.,  13000.,
        96995.,  22381.,  96223.,   8344.,  17945.,  36010.,  19359.,
         4272.,  34214.,   3002.,  73795.,   1745.,   7393.,   6292.,
        49552.,  51417.,  21384.,   4280.,  15239.,  72361.,    261.,
        20103.,  21369.,  46990.,  11082.,   4563.,   5164.,  38049.,
        16988.,   2152.,  17641.,   4347.,  32758.,  26515.,  10127.,
         5987.,  12205.,  15417.,   1701.,   2494.,   9228.,   2702.,
         3414.])

-234