## Data Cleaning Challenge
### 1. Data Imports
Importing the required library and reading the data set(data_cleaning_challenge.csv)

In [1]:
import pandas as pd

In [2]:
data_import = pd.read_csv('data_cleaning_challenge.csv')

### 2.Data Understanding
#### Let's Get to know more about our data set

In [3]:
data_import.head()

Unnamed: 0,Row Type,Iter Number,Power1,Speed1,Speed2,Electricity,Effort,Weight,Torque,Unnamed: 9,Unnamed: 10
0,first name: Person,last name: Human,date: end of time,,,,,,,,
1,,,,,,,,,,,
2,Row Type,Iter Number,Power1,Speed1,Speed2,Electricity,Effort,Weight,Torque,,
3,Iter,1,360,108,863,599,680,442,982,,
4,Iter,2,684,508,613,241,249,758,639,,


In [5]:
data_import.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76377 entries, 0 to 76376
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Row Type     58397 non-null  object 
 1   Iter Number  58397 non-null  object 
 2   Power1       58397 non-null  object 
 3   Speed1       52403 non-null  object 
 4   Speed2       52403 non-null  object 
 5   Electricity  52403 non-null  object 
 6   Effort       52403 non-null  object 
 7   Weight       52403 non-null  object 
 8   Torque       52403 non-null  object 
 9   Unnamed: 9   0 non-null      float64
 10  Unnamed: 10  83 non-null     object 
dtypes: float64(1), object(10)
memory usage: 6.4+ MB


## 3. Data Cleaning and Preparation
#### From the above info we can see there's a extra 2 Empty column(Unnamed 9 & Unnamed10) let's go ahead and drop that.

In [6]:
drop_extra = data_import.drop(columns=['Unnamed: 9', 'Unnamed: 10'])

In [7]:
drop_extra

Unnamed: 0,Row Type,Iter Number,Power1,Speed1,Speed2,Electricity,Effort,Weight,Torque
0,first name: Person,last name: Human,date: end of time,,,,,,
1,,,,,,,,,
2,Row Type,Iter Number,Power1,Speed1,Speed2,Electricity,Effort,Weight,Torque
3,Iter,1,360,108,863,599,680,442,982
4,Iter,2,684,508,613,241,249,758,639
...,...,...,...,...,...,...,...,...,...
76372,Iter,6,879,73,977,680,500,395,863
76373,Average,979,641,531,374,448,407,185,439
76374,Maximum,783,172,941,53,982,217,963,502
76375,Std.Dev.,221,112,717,630,239,561,142,909


In [8]:
# Drop NAN values so we can perform our Iteration to assign a number to each Table separated
drop_nans = drop_extra[drop_extra['Row Type'].notna()]

In [9]:
drop_nans

Unnamed: 0,Row Type,Iter Number,Power1,Speed1,Speed2,Electricity,Effort,Weight,Torque
0,first name: Person,last name: Human,date: end of time,,,,,,
2,Row Type,Iter Number,Power1,Speed1,Speed2,Electricity,Effort,Weight,Torque
3,Iter,1,360,108,863,599,680,442,982
4,Iter,2,684,508,613,241,249,758,639
5,Iter,3,365,126,825,407,855,164,86
...,...,...,...,...,...,...,...,...,...
76372,Iter,6,879,73,977,680,500,395,863
76373,Average,979,641,531,374,448,407,185,439
76374,Maximum,783,172,941,53,982,217,963,502
76375,Std.Dev.,221,112,717,630,239,561,142,909


In [10]:
# Now we iterate through the dataframe and assign a Number to each table
columns_value = []
counter = 0
for i in drop_nans['Row Type']:
    if 'first name' in i:
        counter += 1
    columns_value.append(counter)

In [11]:
iter_cols = drop_nans
iter_cols['Iteration'] = columns_value
# Incase you get an error while running the above code try this code below
# iter_cols.loc['iteration'] = columns_value
iter_cols

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iter_cols['Iteration'] = columns_value


Unnamed: 0,Row Type,Iter Number,Power1,Speed1,Speed2,Electricity,Effort,Weight,Torque,Iteration
0,first name: Person,last name: Human,date: end of time,,,,,,,1
2,Row Type,Iter Number,Power1,Speed1,Speed2,Electricity,Effort,Weight,Torque,1
3,Iter,1,360,108,863,599,680,442,982,1
4,Iter,2,684,508,613,241,249,758,639,1
5,Iter,3,365,126,825,407,855,164,86,1
...,...,...,...,...,...,...,...,...,...,...
76372,Iter,6,879,73,977,680,500,395,863,5994
76373,Average,979,641,531,374,448,407,185,439,5994
76374,Maximum,783,172,941,53,982,217,963,502,5994
76375,Std.Dev.,221,112,717,630,239,561,142,909,5994


In [12]:
# Drop extra column names
drop_extra_cols_name_row = iter_cols[iter_cols['Row Type'] != 'Row Type']
drop_extra_cols_name_row

Unnamed: 0,Row Type,Iter Number,Power1,Speed1,Speed2,Electricity,Effort,Weight,Torque,Iteration
0,first name: Person,last name: Human,date: end of time,,,,,,,1
3,Iter,1,360,108,863,599,680,442,982,1
4,Iter,2,684,508,613,241,249,758,639,1
5,Iter,3,365,126,825,407,855,164,86,1
6,Iter,4,764,594,304,718,278,674,774,1
...,...,...,...,...,...,...,...,...,...,...
76372,Iter,6,879,73,977,680,500,395,863,5994
76373,Average,979,641,531,374,448,407,185,439,5994
76374,Maximum,783,172,941,53,982,217,963,502,5994
76375,Std.Dev.,221,112,717,630,239,561,142,909,5994


In [13]:
name_dataframe = drop_extra_cols_name_row[drop_extra_cols_name_row['Row Type'].str.contains('first name')]
# Drop the other column we are not going to be needing them for now ! Dont forget inplace keyword so as not to create a copy.
name_dataframe.drop(columns = ['Speed1', 'Speed2', 'Electricity', 'Effort', 'Weight', 'Torque'], inplace = True)
# Rename our DataFrame 
name_dataframe.rename(columns = {'Row Type':'First Name','Iter Number':'Last Name','Power1':'Date'}, inplace=True)
# Taking the first name "Person"
name_dataframe['First Name'] = name_dataframe['First Name'].str[12:]
# Taking the last name "Human"
name_dataframe['Last Name'] = name_dataframe['Last Name'].str[11:]
# taking the datename "end of date"
name_dataframe['Date'] = name_dataframe['Date'].str[6:]
name_dataframe

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  name_dataframe['First Name'] = name_dataframe['First Name'].str[12:]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable

Unnamed: 0,First Name,Last Name,Date,Iteration
0,Person,Human,end of time,1
14,Person,Human,end of time,2
28,Person,Human,end of time,3
42,Person,Human,end of time,4
56,Person,Human,end of time,5
...,...,...,...,...
76309,Person,Human,end of time,5990
76322,Person,Human,end of time,5991
76336,Person,Human,end of time,5992
76350,Person,Human,end of time,5993


In [14]:
# Now les create a new table excluding the name data frame by just using the negation sign "~"
no_name_dataframe = drop_extra_cols_name_row[-drop_extra_cols_name_row['Row Type'].str.contains('first name')]
no_name_dataframe

Unnamed: 0,Row Type,Iter Number,Power1,Speed1,Speed2,Electricity,Effort,Weight,Torque,Iteration
3,Iter,1,360,108,863,599,680,442,982,1
4,Iter,2,684,508,613,241,249,758,639,1
5,Iter,3,365,126,825,407,855,164,86,1
6,Iter,4,764,594,304,718,278,674,774,1
7,Iter,5,487,97,593,206,779,800,123,1
...,...,...,...,...,...,...,...,...,...,...
76372,Iter,6,879,73,977,680,500,395,863,5994
76373,Average,979,641,531,374,448,407,185,439,5994
76374,Maximum,783,172,941,53,982,217,963,502,5994
76375,Std.Dev.,221,112,717,630,239,561,142,909,5994


In [15]:
# Merge the Name data frame and no name data frame together
df = pd.merge(left = name_dataframe, right = no_name_dataframe, how='inner', on ='Iteration')
df.head(10)

Unnamed: 0,First Name,Last Name,Date,Iteration,Row Type,Iter Number,Power1,Speed1,Speed2,Electricity,Effort,Weight,Torque
0,Person,Human,end of time,1,Iter,1,360,108,863,599,680,442,982
1,Person,Human,end of time,1,Iter,2,684,508,613,241,249,758,639
2,Person,Human,end of time,1,Iter,3,365,126,825,407,855,164,86
3,Person,Human,end of time,1,Iter,4,764,594,304,718,278,674,774
4,Person,Human,end of time,1,Iter,5,487,97,593,206,779,800,123
5,Person,Human,end of time,1,Average,182,361,741,231,731,493,847,237
6,Person,Human,end of time,1,Maximum,276,33,97,154,25,922,9,312
7,Person,Human,end of time,1,Std.Dev.,523,1000,34,904,237,600,170,553
8,Person,Human,end of time,1,Total,336,-,-,-,-,977,744,652
9,Person,Human,end of time,2,Iter,1,702,494,311,492,456,370,150


In [16]:
# Now let's check the information of the Data!!
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46409 entries, 0 to 46408
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   First Name   46409 non-null  object
 1   Last Name    46409 non-null  object
 2   Date         46409 non-null  object
 3   Iteration    46409 non-null  int64 
 4   Row Type     46409 non-null  object
 5   Iter Number  46409 non-null  object
 6   Power1       46409 non-null  object
 7   Speed1       46409 non-null  object
 8   Speed2       46409 non-null  object
 9   Electricity  46409 non-null  object
 10  Effort       46409 non-null  object
 11  Weight       46409 non-null  object
 12  Torque       46409 non-null  object
dtypes: int64(1), object(12)
memory usage: 5.0+ MB


## Nice Job!! We where able to clean up a Dirty DataFram to a colplete Clean one! Now it is Ready for EDA and Modeling : )