# Chapter 7: Data Exploration and Transformation

## Dealing with Messy Data

### Working on data without column headers

In [1]:
import pandas as pd

In [3]:
# Sample customer records, one plain list per row.
# The fifth record holds only six values, so it is shorter than the others.
row1 = [1001.0, 'Pandas Banking', 235000, 248000, 5.5, 2013, 3, 10, 0]
row2 = [1002.0, 'Pandas Grocery', 196000, 205000, 4.5, 2016, 4, 30, 0]
row3 = [1003.0, 'Pandas Telecom', 167000, 193000, 15.5, 2010, 11, 24, 0]
row4 = [1004.0, 'Pandas Transport', 79000, 90000, 13.9, 2018, 1, 15, 1]
row5 = [1005.5, 'Pandas Insurance', 241000, 264000, 9.5, 2009]

# Build the DataFrame from the rows. No column names are supplied, so the
# columns get integer labels, and the cells missing from the short fifth
# record are filled with NaN.
data_frame = pd.DataFrame(data=[row1, row2, row3, row4, row5])

# Display the DataFrame values
data_frame

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1001.0,Pandas Banking,235000,248000,5.5,2013,3.0,10.0,0.0
1,1002.0,Pandas Grocery,196000,205000,4.5,2016,4.0,30.0,0.0
2,1003.0,Pandas Telecom,167000,193000,15.5,2010,11.0,24.0,0.0
3,1004.0,Pandas Transport,79000,90000,13.9,2018,1.0,15.0,1.0
4,1005.5,Pandas Insurance,241000,264000,9.5,2009,,,


In [4]:
# Print a summary of the frame: each column's dtype and non-null count.
# Columns 6-8 come up one value short because the fifth record supplied only six entries.
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       5 non-null      float64
 1   1       5 non-null      object 
 2   2       5 non-null      int64  
 3   3       5 non-null      int64  
 4   4       5 non-null      float64
 5   5       5 non-null      int64  
 6   6       4 non-null      float64
 7   7       4 non-null      float64
 8   8       4 non-null      float64
dtypes: float64(5), int64(3), object(1)
memory usage: 488.0+ bytes


In [6]:
# Cast column 0 from float to the default integer type.
# The cast discards any fractional part, so 1005.5 is stored as 1005.
data_frame[0] = data_frame[0].astype(int)
data_frame[0]

0    1001
1    1002
2    1003
3    1004
4    1005
Name: 0, dtype: int32

In [7]:
# Row-by-row total of column 2 and column 3
data_frame[2].add(data_frame[3])

0    483000
1    401000
2    360000
3    169000
4    505000
dtype: int64

In [8]:
# Header labels for the customer DataFrame, one per column, in column order
column_names = [
    "Customer ID",
    "Customer Name",
    "2018 Revenue",
    "2019 Revenue",
    "Growth",
    "Start Year",
    "Start Month",
    "Start Day",
    "New Customer",
]
column_names

['Customer ID',
 'Customer Name',
 '2018 Revenue',
 '2019 Revenue',
 'Growth',
 'Start Year',
 'Start Month',
 'Start Day',
 'New Customer']

In [10]:
# Swap the integer column labels for the descriptive header names
data_frame.columns = pd.Index(column_names)
data_frame

Unnamed: 0,Customer ID,Customer Name,2018 Revenue,2019 Revenue,Growth,Start Year,Start Month,Start Day,New Customer
0,1001,Pandas Banking,235000,248000,5.5,2013,3.0,10.0,0.0
1,1002,Pandas Grocery,196000,205000,4.5,2016,4.0,30.0,0.0
2,1003,Pandas Telecom,167000,193000,15.5,2010,11.0,24.0,0.0
3,1004,Pandas Transport,79000,90000,13.9,2018,1.0,15.0,1.0
4,1005,Pandas Insurance,241000,264000,9.5,2009,,,


In [11]:
# A second scenario: loading a CSV file that was saved without column headers.

# With the default header inference, pandas treats the first row of the file
# as the column headings, so the first record ends up as the header.
file_url = "https://raw.githubusercontent.com/PacktWorkshops/The-Pandas-Workshop/master/Chapter07/Data/retail_purchase_missing_headers.csv"
data_frame = pd.read_csv(file_url, header="infer")
data_frame

Unnamed: 0,10001,24/05/20,Wheat,4.8lb,€17,Fline Store
0,10002,05/05/20,Fruit Juice,3.1lb,€19,Dello Superstore
1,10003,27/04/20,Vegetables,1.2lb,€15,Javies Retail
2,10004,05/05/20,Oil,3.1lb,€17,Javies Retail
3,10005,27/04/20,Wheat,4.8lb,€13,Javies Retail
4,10006,14/01/20,Butter,3.6lb,€27,Oldi Superstore
5,10007,20/04/20,Oil,4.8lb,€21,Dello Superstore
6,10008,05/05/20,Wheat,3.6lb,€25,Oldi Superstore
7,10009,17/04/20,Fruits,1.2lb,€24,Oldi Superstore
8,10010,15/06/20,Oil,4.4lb,€25,Kanes Store
9,10011,17/06/20,Oil,4.4lb,€16,Fline Store


In [12]:
# header=None stops pandas from using the first row as column headers:
# every row is read as data and the columns receive integer labels.
data_frame = pd.read_csv(filepath_or_buffer=file_url, header=None)
data_frame

Unnamed: 0,0,1,2,3,4,5
0,10001,24/05/20,Wheat,4.8lb,€17,Fline Store
1,10002,05/05/20,Fruit Juice,3.1lb,€19,Dello Superstore
2,10003,27/04/20,Vegetables,1.2lb,€15,Javies Retail
3,10004,05/05/20,Oil,3.1lb,€17,Javies Retail
4,10005,27/04/20,Wheat,4.8lb,€13,Javies Retail
5,10006,14/01/20,Butter,3.6lb,€27,Oldi Superstore
6,10007,20/04/20,Oil,4.8lb,€21,Dello Superstore
7,10008,05/05/20,Wheat,3.6lb,€25,Oldi Superstore
8,10009,17/04/20,Fruits,1.2lb,€24,Oldi Superstore
9,10010,15/06/20,Oil,4.4lb,€25,Kanes Store


In [13]:
# Assign descriptive headers to the retail purchase data

column_names = [
    "Receipt Id",
    "Date of Purchase",
    "Product Name",
    "Product Weight",
    "Total Price",
    "Retail shop name",
]
data_frame.columns = column_names
data_frame

Unnamed: 0,Receipt Id,Date of Purchase,Product Name,Product Weight,Total Price,Retail shop name
0,10001,24/05/20,Wheat,4.8lb,€17,Fline Store
1,10002,05/05/20,Fruit Juice,3.1lb,€19,Dello Superstore
2,10003,27/04/20,Vegetables,1.2lb,€15,Javies Retail
3,10004,05/05/20,Oil,3.1lb,€17,Javies Retail
4,10005,27/04/20,Wheat,4.8lb,€13,Javies Retail
5,10006,14/01/20,Butter,3.6lb,€27,Oldi Superstore
6,10007,20/04/20,Oil,4.8lb,€21,Dello Superstore
7,10008,05/05/20,Wheat,3.6lb,€25,Oldi Superstore
8,10009,17/04/20,Fruits,1.2lb,€24,Oldi Superstore
9,10010,15/06/20,Oil,4.4lb,€25,Kanes Store
