In [1]:
import os
import pandas as pd

# Pull the Auto MPG table straight from the course data server.
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/auto-mpg.csv"
)
# Plain-text preview of the first five rows.
print(df.iloc[:5])

    mpg  cylinders  displacement horsepower  weight  acceleration  year  \
0  18.0          8         307.0        130    3504          12.0    70   
1  15.0          8         350.0        165    3693          11.5    70   
2  18.0          8         318.0        150    3436          11.0    70   
3  16.0          8         304.0        150    3433          12.0    70   
4  17.0          8         302.0        140    3449          10.5    70   

   origin                       name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  
2       1         plymouth satellite  
3       1              amc rebel sst  
4       1                ford torino  


## The display function renders a dataframe more cleanly than merely printing it. Specifying the maximum number of rows and columns gives you greater control over the display.

In [2]:
# Cap the rendered table at 7 columns and 5 rows; pandas replaces whatever
# does not fit with "..." markers.
pd.options.display.max_columns = 7
pd.options.display.max_rows = 5
display(df)

Unnamed: 0,mpg,cylinders,displacement,...,year,origin,name
0,18.0,8,307.0,...,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,...,70,1,buick skylark 320
...,...,...,...,...,...,...,...
396,28.0,4,120.0,...,82,1,ford ranger
397,31.0,4,119.0,...,82,1,chevy s-10


## It is possible to generate a second dataframe that holds statistical information about the first dataframe.

In [3]:
# Keep only the integer and floating-point columns.
df = df.select_dtypes(include=['int', 'float'])

headers = df.columns.tolist()

# One summary record (mean, variance, standard deviation) per remaining column.
fields = [
    {
        'name': column,
        'mean': df[column].mean(),
        'var': df[column].var(),
        'sdev': df[column].std(),
    }
    for column in headers
]

# Print the records one per line.
for field in fields:
    print(field)

{'name': 'mpg', 'mean': 23.514572864321615, 'var': 61.089610774274405, 'sdev': 7.815984312565782}
{'name': 'cylinders', 'mean': 5.454773869346734, 'var': 2.8934154399199943, 'sdev': 1.7010042445332094}
{'name': 'displacement', 'mean': 193.42587939698493, 'var': 10872.199152247364, 'sdev': 104.26983817119581}
{'name': 'weight', 'mean': 2970.424623115578, 'var': 717140.9905256768, 'sdev': 846.8417741973271}
{'name': 'acceleration', 'mean': 15.568090452261291, 'var': 7.604848233611381, 'sdev': 2.7576889298126757}
{'name': 'year', 'mean': 76.01005025125629, 'var': 13.672442818627143, 'sdev': 3.697626646732623}
{'name': 'origin', 'mean': 1.5728643216080402, 'var': 0.6432920268850575, 'sdev': 0.8020548777266163}


### This code outputs a list of dictionaries that hold this statistical information. The printed list looks similar to JSON.

In [4]:
# Lift both display limits so the whole summary table is rendered.
# None is the documented "unlimited" value; 0 asks pandas to auto-detect the
# terminal size, which is not possible inside a notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Turn the list of per-column records into a dataframe, one row per record.
df2 = pd.DataFrame(fields)
display(df2)

Unnamed: 0,name,mean,var,sdev
0,mpg,23.514573,61.089611,7.815984
1,cylinders,5.454774,2.893415,1.701004
2,displacement,193.425879,10872.199152,104.269838
3,weight,2970.424623,717140.990526,846.841774
4,acceleration,15.56809,7.604848,2.757689
5,year,76.01005,13.672443,3.697627
6,origin,1.572864,0.643292,0.802055


### Missing Values
    Missing values are a reality of machine learning. Ideally, every row of data will
    have values for all columns. However, this is rarely the case. Most of the values
    are present in the MPG database. However, there are missing values in the horsepower
    column. A common practice is to replace missing values with the median value for that
    column. The following code calculates the median of the horsepower column and
    replaces any NA values in horsepower with that median:
    

In [5]:
import os
import pandas as pd
# Parse both 'NA' and '?' as missing-value markers while loading.
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/auto-mpg.csv",
    na_values=['NA', '?'],
)
print(f"horsepower has na? {df['horsepower'].isnull().values.any()}")

print("Filling missing values...")
# Substitute the column median for every missing horsepower entry.
med = df['horsepower'].median()
df['horsepower'] = df['horsepower'].fillna(med)

# Alternative: df = df.dropna() would discard the incomplete rows instead.
print(f"horsepower has na? {df['horsepower'].isnull().values.any()}")

horsepower has na? True
Filling missing values...
horsepower has na? False


### Dealing with outliers
    Outliers are values that are unusually high or low. Sometimes outliers are simply errors
    that result from faulty observation. Outliers can also be truly large or small values
    that may be difficult to address. We typically consider an outlier to be a value that is
    several standard deviations from the mean. The following function can remove such values.

In [6]:
# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    """Drop, in place, the rows of ``df`` whose ``name`` value is an outlier.

    A row counts as an outlier when its value lies ``sd`` or more standard
    deviations (as computed by ``Series.std``) away from the column mean.

    Args:
        df: DataFrame to filter; it is modified in place.
        name: Label of the column to test.
        sd: Threshold, expressed as a multiple of the column's standard
            deviation.

    Returns:
        None.
    """
    values = df[name]
    spread = values.std()
    # A constant column (or one whose deviation is undefined) has no
    # outliers. Without this guard a zero deviation would make the ">="
    # test below match every row and empty the frame.
    if not spread > 0:
        return
    # Series.abs() avoids depending on a numpy import made in another cell.
    distance = (values - values.mean()).abs()
    drop_rows = df.index[distance >= sd * spread]
    df.drop(drop_rows, axis=0, inplace=True)

### The code below will drop every row from the Auto MPG dataset where the mpg value is two or more standard deviations above or below the mean.

In [7]:
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore

df = pd.read_csv("https://data.heatonresearch.com/data/t81-558/auto-mpg.csv",
    na_values=['NA','?'])

# Fill the missing horsepower values with the column median
med = df['horsepower'].median()
df['horsepower'] = df['horsepower'].fillna(med)

# Drop the name column. The axis is selected by keyword (columns=) because
# pandas 2.0 no longer accepts it as a positional argument.
df = df.drop(columns='name')

# Drop rows whose mpg is 2 or more standard deviations from the mean
print("Length before MPG outliers droped dropped: {}".format(len(df)))
remove_outliers(df,'mpg',2)
print("Length after MPG outliers dropped: {}".format(len(df)))

# None means "no column limit"; rows stay capped at 5.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5)
display(df)

Length before MPG outliers droped dropped: 398
Length after MPG outliers dropped: 388


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,1
1,15.0,8,350.0,165.0,3693,11.5,70,1
...,...,...,...,...,...,...,...,...
396,28.0,4,120.0,79.0,2625,18.6,82,1
397,31.0,4,119.0,82.0,2720,19.4,82,1


# Dropping Fields
    Some fields are of no value to the neural network and should be dropped. The following code removes
    the name column from the MPG dataset.

In [8]:
import os
import pandas as pd

df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/auto-mpg.csv",
    na_values=['NA', '?'])

print(f"Before drop:  {list(df.columns)}")
# Name the axis with columns=: pandas 2.0 rejects the old positional form
# df.drop('name', 1, ...) with a TypeError.
df = df.drop(columns='name')
print(f"After drop:{list(df.columns)}")

Before drop:  ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin', 'name']
After drop:['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin']


## Concatenating Rows and Columns
    Python can concatenate rows and columns together to form new data frames. 
    The code below creates a new data frame from the name and horsepower columns 
    from the Auto MPG dataset.
    The program does this by concatenating two columns together.

In [9]:
# Create new dataframe from name and horsepower

import os 
import pandas as pd

df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/auto-mpg.csv",
    na_values=['NA', '?'])

# Place the two columns side by side (axis=1), name first.
col_horsepower = df['horsepower']
col_name = df['name']
result = pd.concat([col_name, col_horsepower], axis=1)

# None is the documented "no column limit" value; 0 requests terminal
# auto-detection, which a notebook cannot provide. Rows stay capped at 5.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5)
display(result)

Unnamed: 0,name,horsepower
0,chevrolet chevelle malibu,130.0
1,buick skylark 320,165.0
...,...,...
396,ford ranger,79.0
397,chevy s-10,82.0


#### The concat function can also concatenate rows together. This code concatenates the first two rows and the last two rows of the Auto MPG dataset.

In [10]:
# Create new dataframe from first 2 rows and last 2 rows
import os 
import pandas as pd

df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/auto-mpg.csv",
    na_values=['NA', '?'])

# Stack the first two and the last two rows (axis=0); the original row
# labels are kept.
result = pd.concat([df.head(2), df.tail(2)], axis=0)
pd.set_option('display.max_columns', 7)
# None is the documented "no row limit" value; 0 requests terminal
# auto-detection, which a notebook cannot provide.
pd.set_option('display.max_rows', None)
display(result)

Unnamed: 0,mpg,cylinders,displacement,...,year,origin,name
0,18.0,8,307.0,...,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,...,70,1,buick skylark 320
396,28.0,4,120.0,...,82,1,ford ranger
397,31.0,4,119.0,...,82,1,chevy s-10
