# Preprocessing for Machine Learning 
* Standardizing Data
* Feature Engineering
*

# Standarization
for models who have high variance, continuous and on different scales, model linear space, linearly assumptions
* log normalization: high variance
* Scaling: different scales, linear chracteristics, 

In [1]:
import pandas as pd
import numpy as np
data_boston=pd.read_csv('HousingData.csv')
print(data_boston.columns)
data_boston.dropna(inplace=True)  #just for simplicity

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')


## log normalization

In [2]:
print('variance before log normalization',data_boston.TAX.var())
#lets reduce de variance of just one column (TAX) whic has he highest variance
data_boston['TAX_log']=np.log(data_boston.TAX)
print('variance after log normalization',data_boston.TAX_log.var())

variance before log normalization 28329.070355588272
variance after log normalization 0.15683393142975663


## Scaling

In [3]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
data_sclaled=pd.DataFrame(scaler.fit_transform(data_boston),columns=data_boston.columns)
print('variance before scaling',data_boston.TAX.var())
print('variance after scaling', data_sclaled.TAX.var())

variance before scaling 28329.070355588272
variance after scaling 1.0025445292620867


## Modeling data

In [4]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split 
X=data_boston.drop(columns=['MEDV'])
y=data_boston['MEDV']
knn=KNeighborsRegressor()

#### non-scaled data
# Split the dataset and labels into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y)

# Fit the k-nearest neighbors model to the training data
knn.fit(X_train,y_train)

# Score the model on the test data
print('Score for non scaled data',knn.score(X_test,y_test))

#### scaled data
# Create the scaling method.
ss = StandardScaler()

# Apply the scaling method to the dataset used for modeling.
X_scaled = ss.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

# Fit the k-nearest neighbors model to the training data.
knn.fit(X_train,y_train)

# Score the model on the test data.
print('Score for scaled data',knn.score(X_test,y_test))

Score for non scaled data 0.6032485811083597
Score for scaled data 0.6464026125478213


# Feature engineering
* Creation of new features based on existing features 
* Insight into relationships between features 
* Extract and expand data 
* Dataset-dependent


## Encoding categorical variables 

### Binary encoding

In [5]:
from sklearn.preprocessing import LabelEncoder
hiking=pd.read_json('https://assets.datacamp.com/production/repositories/1816/datasets/4f26c48451bdbf73db8a58e226cd3d6b45cf7bb5/hiking.json')

# Set up the LabelEncoder object
enc = LabelEncoder()

# Apply the encoding to the "Accessible" column
hiking['Accessible_enc']=enc.fit_transform(hiking['Accessible'])

# Compare the two columns
print(hiking[['Accessible','Accessible_enc']].head())

  Accessible  Accessible_enc
0          Y               1
1          N               0
2          N               0
3          N               0
4          N               0


### One-Hot Encoding

creates different columns for each categorical value from a feature

In [6]:
volunteer=pd.read_csv('https://assets.datacamp.com/production/repositories/1816/datasets/668b96955d8b252aa8439c7602d516634e3f015e/volunteer_opportunities.csv')
# Transform the category_desc column
category_enc = pd.get_dummies(volunteer["category_desc"])

# Take a look at the encoded columns
print(category_enc.head())

   Education  Emergency Preparedness  Environment  Health  \
0          0                       0            0       0   
1          0                       0            0       0   
2          0                       0            0       0   
3          0                       0            0       0   
4          0                       0            1       0   

   Helping Neighbors in Need  Strengthening Communities  
0                          0                          0  
1                          0                          1  
2                          0                          1  
3                          0                          1  
4                          0                          0  


## Numerical Values

In [7]:
# Create a list of the columns to average
run_columns = ["run1", "run2", "run3", "run4", "run5"]

# Use apply to create a mean column
#running_times_5k["mean"] = running_times_5k.apply(lambda row: row[run_columns].mean(), axis=1)

# Take a look at the results
#print(running_times_5k)

## FEature  engineering for text


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
nb=GaussianNB()
# Take the title text
title_text = volunteer["title"]

# Create the vectorizer method
tfidf_vec = TfidfVectorizer()

# Transform the text into tf-idf vectors

text_tfidf = tfidf_vec.fit_transform(title_text)


# Split the dataset according to the class distribution of category_desc
y = volunteer["category_desc"]

#X_train, X_test, y_train, y_test = train_test_split(text_tfidf.toarray(), y, stratify=y)

# Fit the model to the training data
#nb.fit(X_train,y_train)

# Print out the model's accuracy
#print(nb.score(X_test,y_test))

## Selecting features using vectors

# Complete pre-processing

In [13]:
ufo=pd.read_csv('https://assets.datacamp.com/production/repositories/1816/datasets/a5ebfe5d2ed194f2668867603b563963af4769e9/ufo_sightings_large.csv')

ufo=ufo.loc[~ufo.apply(lambda row: (row==0).all(), axis=1)]
# Check the column types
print(ufo.dtypes)

# Change the type of seconds to float
ufo["seconds"] = ufo["seconds"].astype(float)

# Change the date column to type datetime
ufo["date"] = pd.to_datetime(ufo["date"])

# Check the column types
print(ufo[["seconds", "date"]].dtypes)

date               object
city               object
state              object
country            object
type               object
seconds           float64
length_of_time     object
desc               object
recorded           object
lat                object
long              float64
dtype: object
seconds           float64
date       datetime64[ns]
dtype: object


In [14]:
# Check how many values are missing in the length_of_time, state, and type columns
print(ufo[['length_of_time','state','type']].isnull().sum())

# Keep only rows where length_of_time, state, and type are not null
ufo_no_missing = ufo[ ufo['length_of_time'].notnull() & 
          ufo['state'].notnull() & 
          ufo['type'].notnull()]

# Print out the shape of the new dataset
print('shape:', ufo_no_missing.shape)


length_of_time    143
state             419
type              159
dtype: int64
shape: (4283, 11)


In [15]:
import re
def return_minutes(time_string):
    
    # We'll use \d+ to grab digits and match it to the column values
    pattern = re.compile(r"\d+")
        
    # Use match on the pattern and column
    num = re.match(pattern, time_string)
    if num is not None:
        return int(num.group(0))
        
# Apply the extraction to the length_of_time column
ufo['minutes']=ufo.length_of_time.str.extract('(\d+)').astype('float')

# Take a look at the head of both of the columns
print(ufo[["length_of_time", "minutes"]].head())

ufo.minutes.describe()


    length_of_time  minutes
0          2 weeks      2.0
1           30sec.     30.0
2              NaN      NaN
3  about 5 minutes      5.0
4                2      2.0


count    4443.000000
mean       11.455098
std        29.027451
min         0.000000
25%         2.000000
50%         5.000000
75%        15.000000
max      1640.000000
Name: minutes, dtype: float64

In [18]:
# Check the variance of the seconds and minutes columns
print(ufo[['seconds','minutes']].var())

# Log normalize the seconds column
ufo["seconds_log"] = np.log(ufo['seconds'])

# Print out the variance of just the seconds_log column
print(ufo["seconds_log"].var())

seconds    3.156735e+10
minutes    8.425929e+02
dtype: float64
nan


In [20]:
#FEATURE ENGINEERING
# Use Pandas to encode us values as 1 and others as 0
ufo["country_enc"] = ufo["country"].apply(lambda x: 1 if x=='us' else 0 )

# Print the number of unique type values
print(len(ufo.type.unique()))

# Create a one-hot encoded set of the type values
type_set = pd.get_dummies(ufo.type)

# Concatenate this set back to the ufo DataFrame
ufo = pd.concat([ufo,type_set], axis=1)

22


In [21]:
# Look at the first 5 rows of the date column
print(ufo.date.head())

# Extract the month from the date column
ufo["month"] = ufo["date"].apply(lambda x: x.month)

# Extract the year from the date column
ufo["year"] = ufo["date"].apply(lambda x: x.year)

# Take a look at the head of all three columns
print(ufo[['date','month','year']].head())

0   2011-11-03 19:21:00
1   2004-10-03 19:05:00
2   2009-09-25 21:00:00
3   2002-11-21 05:45:00
4   2010-08-19 12:55:00
Name: date, dtype: datetime64[ns]
                 date  month  year
0 2011-11-03 19:21:00     11  2011
1 2004-10-03 19:05:00     10  2004
2 2009-09-25 21:00:00      9  2009
3 2002-11-21 05:45:00     11  2002
4 2010-08-19 12:55:00      8  2010


In [23]:
# Take a look at the head of the desc field
print(ufo["desc"].head())

# Create the tfidf vectorizer object
vec = TfidfVectorizer()

# Use vec's fit_transform method on the desc field
#desc_tfidf = vec.fit_transform(ufo["desc"])
##############------------ NO FUNCIONA PQ HAY NAN'S
# Look at the number of columns this creates
#print(desc_tfidf.shape)

0    Red blinking objects similar to airplanes or s...
1                 Many fighter jets flying towards UFO
2    Green&#44 red&#44 and blue pulses of light tha...
3    It was a large&#44 triangular shaped flying ob...
4       A white spinning disc in the shape of an oval.
Name: desc, dtype: object


In [27]:
# Check the correlation between the seconds, seconds_log, and minutes columns
print(ufo[["seconds", "seconds_log", "minutes"]].corr())

# Make a list of features to drop   
to_drop = ["city", "country", "date", "desc", "lat", "length_of_time", "long", "minutes", "recorded", "seconds", "state"]

# Drop those features
ufo_dropped = ufo.drop(to_drop, axis=1)

# Let's also filter some words out of the text vector we created
#filtered_words = words_to_filter(vocab, vec.vocabulary_, desc_tfidf, 4)

              seconds  seconds_log   minutes
seconds      1.000000     0.164613 -0.008100
seconds_log  0.164613     1.000000  0.108606
minutes     -0.008100     0.108606  1.000000
