<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#step-1:-Introduction-to-Data-Preprocessing" data-toc-modified-id="step-1:-Introduction-to-Data-Preprocessing-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>step 1: Introduction to Data Preprocessing</a></span></li><li><span><a href="#step-2:-Standardizing-Data" data-toc-modified-id="step-2:-Standardizing-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>step 2: Standardizing Data</a></span></li><li><span><a href="#step-3:-Feature-Engineering" data-toc-modified-id="step-3:-Feature-Engineering-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>step 3: Feature Engineering</a></span></li><li><span><a href="#step-4:-Selecting-features-for-modeling" data-toc-modified-id="step-4:-Selecting-features-for-modeling-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>step 4: Selecting features for modeling</a></span></li><li><span><a href="#step-5:-Putting-it-all-together" data-toc-modified-id="step-5:-Putting-it-all-together-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>step 5: Putting it all together</a></span></li></ul></div>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB 

In [2]:
hiking_url = 'https://assets.datacamp.com/production/course_6576/datasets/hiking.json'
wine_url = 'https://assets.datacamp.com/production/course_6576/datasets/wine_types.csv'
ufo_url = 'https://assets.datacamp.com/production/course_6576/datasets/ufo_sightings_large.csv'
volunteer_url = 'https://assets.datacamp.com/production/course_6576/datasets/volunteer_opportunities.csv'

hike = pd.read_json(hiking_url)
wine = pd.read_csv(wine_url, sep=',')
ufo = pd.read_csv(ufo_url, sep=',')
vol = pd.read_csv(volunteer_url, sep=',')

# step 1: Introduction to Data Preprocessing

missing data,
data types,
class distribution - [class imbalance, stratified sampling]

In [3]:
vol.shape

(665, 35)

In [4]:
vol.head()

Unnamed: 0,opportunity_id,content_id,vol_requests,event_time,title,hits,summary,is_priority,category_id,category_desc,...,end_date_date,status,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
0,4996,37004,50,0,Volunteers Needed For Rise Up & Stay Put! Home...,737,Building on successful events last summer and ...,,,,...,July 30 2011,approved,,,,,,,,
1,5008,37036,2,0,Web designer,22,Build a website for an Afghan business,,1.0,Strengthening Communities,...,February 01 2011,approved,,,,,,,,
2,5016,37143,20,0,Urban Adventures - Ice Skating at Lasker Rink,62,Please join us and the students from Mott Hall...,,1.0,Strengthening Communities,...,January 29 2011,approved,,,,,,,,
3,5022,37237,500,0,Fight global hunger and support women farmers ...,14,The Oxfam Action Corps is a group of dedicated...,,1.0,Strengthening Communities,...,March 31 2012,approved,,,,,,,,
4,5055,37425,15,0,Stop 'N' Swap,31,Stop 'N' Swap reduces NYC's waste by finding n...,,4.0,Environment,...,February 05 2011,approved,,,,,,,,


In [5]:
vol.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 665 entries, 0 to 664
Data columns (total 35 columns):
opportunity_id        665 non-null int64
content_id            665 non-null int64
vol_requests          665 non-null int64
event_time            665 non-null int64
title                 665 non-null object
hits                  665 non-null int64
summary               665 non-null object
is_priority           62 non-null object
category_id           617 non-null float64
category_desc         617 non-null object
amsl                  0 non-null float64
amsl_unit             0 non-null float64
org_title             665 non-null object
org_content_id        665 non-null int64
addresses_count       665 non-null int64
locality              595 non-null object
region                665 non-null object
postalcode            659 non-null float64
primary_loc           0 non-null float64
display_url           665 non-null object
recurrence_type       665 non-null object
hours                 

In [3]:
vol1 = vol.dropna(axis=1,thresh=3); #keep only the col with atleast n non-NA values
print(vol1.isnull().sum())
print('\n')
print('shape after thrashing cols with atleast 3 missing vals: ',vol1.shape)

opportunity_id          0
content_id              0
vol_requests            0
event_time              0
title                   0
hits                    0
summary                 0
is_priority           603
category_id            48
category_desc          48
org_title               0
org_content_id          0
addresses_count         0
locality               70
region                  0
postalcode              6
display_url             0
recurrence_type         0
hours                   0
created_date            0
last_modified_date      0
start_date_date         0
end_date_date           0
status                  0
dtype: int64


shape after thrashing cols with atleast 3 missing vals:  (665, 24)


In [4]:
vol['category_desc'].isnull().sum() #48 missing vaues in category_desc

48

In [5]:
vol2 = vol[vol['category_desc'].notnull()] # new dataframe with no missing values in category_desc
vol2.shape

(617, 35)

In [6]:
vol.dtypes

opportunity_id          int64
content_id              int64
vol_requests            int64
event_time              int64
title                  object
hits                    int64
summary                object
is_priority            object
category_id           float64
category_desc          object
amsl                  float64
amsl_unit             float64
org_title              object
org_content_id          int64
addresses_count         int64
locality               object
region                 object
postalcode            float64
primary_loc           float64
display_url            object
recurrence_type        object
hours                   int64
created_date           object
last_modified_date     object
start_date_date        object
end_date_date          object
status                 object
Latitude              float64
Longitude             float64
Community Board       float64
Community Council     float64
Census Tract          float64
BIN                   float64
BBL       

In [7]:
# assume hits is an object type - convert it to int type
vol['hits'] = vol.hits.astype('int')
vol.dtypes

opportunity_id          int64
content_id              int64
vol_requests            int64
event_time              int64
title                  object
hits                    int64
summary                object
is_priority            object
category_id           float64
category_desc          object
amsl                  float64
amsl_unit             float64
org_title              object
org_content_id          int64
addresses_count         int64
locality               object
region                 object
postalcode            float64
primary_loc           float64
display_url            object
recurrence_type        object
hours                   int64
created_date           object
last_modified_date     object
start_date_date        object
end_date_date          object
status                 object
Latitude              float64
Longitude             float64
Community Board       float64
Community Council     float64
Census Tract          float64
BIN                   float64
BBL       

In [8]:
# class distribution of category_desc - we are trying to predict this col using other features so make sure is not imbalanced
vol.category_desc.value_counts() # we see uneven distrubution in last 2 (occurs <50times)- solution stratified sampling

Strengthening Communities    307
Helping Neighbors in Need    119
Education                     92
Health                        52
Environment                   32
Emergency Preparedness        15
Name: category_desc, dtype: int64

In [9]:
volunteer_X = vol.drop('category_desc',axis=1)
volunteer_y = vol[['category_desc']]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(volunteer_X,volunteer_y,stratify=volunteer_y)
print('y_train: \n',y_train.category_desc.value_counts())
print('\n y_test: \n', y_test.category_desc.value_counts())

y_train: 
 Strengthening Communities    230
Helping Neighbors in Need     89
Education                     69
Health                        39
Environment                   24
Emergency Preparedness        11
Name: category_desc, dtype: int64

 y_test: 
 Strengthening Communities    77
Helping Neighbors in Need    30
Education                    23
Health                       13
Environment                   8
Emergency Preparedness        4
Name: category_desc, dtype: int64


# step 2: Standardizing Data

standardization is a preprocessing task performed on numerical, continuous data. Here we learn about standardizing data, log normalization, KNN on 
nonscale and scaled data

In [25]:
print(wine.head())

print('\n missing values: ',wine.isnull().sum())

   Type  Alcohol  Malic acid   Ash  Alcalinity of ash  Magnesium  \
0     1    14.23        1.71  2.43               15.6        127   
1     1    13.20        1.78  2.14               11.2        100   
2     1    13.16        2.36  2.67               18.6        101   
3     1    14.37        1.95  2.50               16.8        113   
4     1    13.24        2.59  2.87               21.0        118   

   Total phenols  Flavanoids  Nonflavanoid phenols  Proanthocyanins  \
0           2.80        3.06                  0.28             2.29   
1           2.65        2.76                  0.26             1.28   
2           2.80        3.24                  0.30             2.81   
3           3.85        3.49                  0.24             2.18   
4           2.80        2.69                  0.39             1.82   

   Color intensity   Hue  OD280/OD315 of diluted wines  Proline  
0             5.64  1.04                          3.92     1065  
1             4.38  1.05        

In [26]:
wine.Type.value_counts()

2    71
1    59
3    48
Name: Type, dtype: int64

In [34]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

X = wine.drop(['Type', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
        'Magnesium', 'Flavanoids', 
        'Proanthocyanins', 'Color intensity', 
        'OD280/OD315 of diluted wines'],axis=1)
y= np.ravel(wine[['Type']])

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=101)
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
knn.score(X_test,y_test)

0.6666666666666666

In [39]:
pred = knn.predict(X_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

[[12  0  2]
 [ 0  9  4]
 [ 1  5  3]]
             precision    recall  f1-score   support

          1       0.92      0.86      0.89        14
          2       0.64      0.69      0.67        13
          3       0.33      0.33      0.33         9

avg / total       0.67      0.67      0.67        36



In [40]:
wine.var() #need to normalize proline, magnesium, alcalinity of ash. let's only foucs on relevant column - proline 

Type                                0.600679
Alcohol                             0.659062
Malic acid                          1.248015
Ash                                 0.075265
Alcalinity of ash                  11.152686
Magnesium                         203.989335
Total phenols                       0.391690
Flavanoids                          0.997719
Nonflavanoid phenols                0.015489
Proanthocyanins                     0.327595
Color intensity                     5.374449
Hue                                 0.052245
OD280/OD315 of diluted wines        0.504086
Proline                         99166.717355
dtype: float64

In [41]:
wine.describe()

Unnamed: 0,Type,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,1.938202,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.775035,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,1.0,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,1.0,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,2.0,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,3.0,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,3.0,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [42]:
wine['Proline_log'] = np.log(wine.Proline) #log normalized the Proline col
wine.Proline_log.var()

0.17231366191842018

In [43]:
from sklearn.preprocessing import StandardScaler #scaling features that are on different scales 

scaler = StandardScaler()

# Take a subset of the DataFrame you want to scale. 
wine_subset = wine[['Ash','Alcalinity of ash','Magnesium']]

# Apply the scaler to the DataFrame subset.
wine_subset_scaled = scaler.fit_transform(wine_subset) #fit_tranform- fits the method to data as well transform the data in 1 step

In [45]:
#scaled 
X = wine.drop(['Type','Proline_log'],axis=1)
y= np.ravel(wine[['Type']])

X_train, X_test, y_train, y_test = train_test_split(scaler.fit_transform(X),y,random_state=101) #scaled X boosting acc
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
knn.score(X_test,y_test)

0.9555555555555556

In [46]:
#nonscaled
X = wine.drop(['Type','Proline_log'],axis=1)
y= np.ravel(wine[['Type']])

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=101) #scaled X boosting acc
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
knn.score(X_test,y_test)

0.6444444444444445

# step 3: Feature Engineering

create new useful features, how to encode, aggregate and extract info from both numerical as well textual features. feature eng, encode categorical variable,eng numerical features, text classification

In [13]:
print(hike.isnull().sum())
print('\n',hike.Accessible.head(4))

Accessible         0
Difficulty         6
Length             4
Limited_Access     0
Location           0
Name               0
Other_Details      2
Park_Name          0
Prop_ID            0
lat               33
lon               33
dtype: int64

 0    Y
1    N
2    N
3    N
Name: Accessible, dtype: object


In [50]:
# categorical variables encode-binary "Accessible"
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
hike['Accessible_enc'] = enc.fit_transform(hike.Accessible)
print(hike[['Accessible_enc','Accessible']].head())

   Accessible_enc Accessible
0               1          Y
1               0          N
2               0          N
3               0          N
4               0          N


In [14]:
#category_desc - categorical variable with more than 2 categories- one-hot encode this column numerically - get_dummies()
category_enc = pd.get_dummies(vol.category_desc)

category_enc.head()

Unnamed: 0,Education,Emergency Preparedness,Environment,Health,Helping Neighbors in Need,Strengthening Communities
0,0,0,0,0,0,0
1,0,0,0,0,0,1
2,0,0,0,0,0,1
3,0,0,0,0,0,1
4,0,0,1,0,0,0


In [15]:
vol['start_date_converted']=pd.to_datetime(vol['start_date_date']) #convert string to date column
vol['start_date_month'] = vol['start_date_converted'].apply(lambda row: row.month) #extract just the month
print(vol[['start_date_converted','start_date_month']].head())

  start_date_converted  start_date_month
0           2011-07-30                 7
1           2011-02-01                 2
2           2011-01-29                 1
3           2011-02-14                 2
4           2011-02-05                 2


In [57]:
#regular exp- \d+\.\d+ d-digit, + -as many as possible \. -decimal
print(hike.Length.head(2))

def return_mileage(length):
    pattern = re.compile(r"\d+\.\d+")

0    0.8 miles
1     1.0 mile
Name: Length, dtype: object


In [16]:
#eng features from strings using tf/idf
print(vol.title.head(2))
title_text = vol['title']
tfidf_vec = TfidfVectorizer() #create vectorizer method
text_tfidf = tfidf_vec.fit_transform(title_text)

0    Volunteers Needed For Rise Up & Stay Put! Home...
1                                         Web designer
Name: title, dtype: object


In [17]:
text_tfidf_array = text_tfidf.toarray()

In [36]:
#text classification using tf/idf vectors. Here we retrain the model just using new
# features we created in the last cell
text_tfidf_array = text_tfidf.toarray()
y = vol[['category_desc']]

X_train, X_test, y_train, y_test = train_test_split(text_tfidf_array,np.ravel(y),stratify=y)
#knn = KNeighborsClassifier()
#knn.fit(X_train,y_train)
#knn.score(X_test,y_test)

# step 4: Selecting features for modeling

Different techniques to select most imp features from dataset, how to drop redudant features, work with text vectors, and reduce number of features using PCA (principal component analysis). 
*feature selection, remove redundant features,select features using text vectors, dimensionality reduction

In [38]:
hike.head(2) #accessible, difficulty, length needs some feature eng but are good candidates for feature selection yet

Unnamed: 0,Accessible,Difficulty,Length,Limited_Access,Location,Name,Other_Details,Park_Name,Prop_ID,lat,lon
0,Y,,0.8 miles,N,"Enter behind the Salt Marsh Nature Center, loc...",Salt Marsh Nature Trail,<p>The first half of this mile-long trail foll...,Marine Park,B057,,
1,N,Easy,1.0 mile,N,Enter Park at Lincoln Road and Ocean Avenue en...,Lullwater,Explore the Lullwater to see how nature thrive...,Prospect Park,B073,,


In [62]:
#remove duplicates, correlated features, redundant feature can be a feature that has gone through the feature engineering process.
vol1 = vol2.dropna(axis=1,thresh=3)
vol_subset = vol1.drop(["category_desc", "created_date", "locality", "region", "vol_requests"],axis=1)

In [63]:
vol_subset.shape

(617, 19)

In [66]:
wine1 = wine.drop(['Type', 'Alcohol', 'Ash', 'Alcalinity of ash',
       'Magnesium', 'Nonflavanoid phenols',
       'Proanthocyanins', 'Color intensity', 'Proline'],axis=1)

In [69]:
wine1.corr() #remove correlated features. if rorr value >0.75->remove

Unnamed: 0,Malic acid,Total phenols,Flavanoids,Hue,OD280/OD315 of diluted wines
Malic acid,1.0,-0.335167,-0.411007,-0.561296,-0.36871
Total phenols,-0.335167,1.0,0.864564,0.433681,0.699949
Flavanoids,-0.411007,0.864564,1.0,0.543479,0.787194
Hue,-0.561296,0.433681,0.543479,1.0,0.565468
OD280/OD315 of diluted wines,-0.36871,0.699949,0.787194,0.565468,1.0


In [71]:
wine2 = wine1.drop('Flavanoids',axis=1) #corr value > .75 twice

In [74]:
from sklearn.decomposition import PCA

pca = PCA()
wine_x = wine.drop('Type',axis=1)
transfromed_x = pca.fit_transform(wine_x)
pca.explained_variance_ratio_ #check how much variance is explained by each component.

array([9.98091230e-01, 1.73591562e-03, 9.49589576e-05, 5.02173562e-05,
       1.23636847e-05, 8.46213034e-06, 2.80681456e-06, 1.52308053e-06,
       1.12783044e-06, 7.21415811e-07, 3.78060267e-07, 2.12013755e-07,
       8.25392788e-08])

# step 5: Putting it all together

try all the techniques on UFO sightings dataset

In [3]:
ufo.head(2)

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long
0,11/3/2011 19:21,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111
1,10/3/2004 19:05,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556


In [4]:
ufo.dtypes #convert date, split length_of_time and convert, convert desc, lat, long

date               object
city               object
state              object
country            object
type               object
seconds           float64
length_of_time     object
desc               object
recorded           object
lat                object
long              float64
dtype: object

In [17]:
ufo.describe()

Unnamed: 0,seconds,long
count,4935.0,4935.0
mean,6682.665,-84.157158
std,177672.0,42.708607
min,0.0,-170.478889
25%,15.0,-111.904306
50%,120.0,-87.332778
75%,600.0,-77.460694
max,9468000.0,178.4419


In [5]:
#converting
ufo['seconds'] = ufo['seconds'].astype('float')
ufo['date'] = pd.to_datetime(ufo['date'])

ufo[['seconds','date']].dtypes

seconds           float64
date       datetime64[ns]
dtype: object

In [6]:
print(ufo.head(2))
print('\n shape:',ufo.shape)

                 date       city state country     type    seconds  \
0 2011-11-03 19:21:00  woodville    wi      us  unknown  1209600.0   
1 2004-10-03 19:05:00  cleveland    oh      us   circle       30.0   

  length_of_time                                               desc  \
0        2 weeks  Red blinking objects similar to airplanes or s...   
1         30sec.               Many fighter jets flying towards UFO   

     recorded         lat       long  
0  12/12/2011  44.9530556 -92.291111  
1  10/27/2004  41.4994444 -81.695556  

 shape: (4935, 11)


In [7]:
ufo.isnull().sum() #state, country, type, length_of_time have lots of missing values

date                0
city                9
state             419
country           680
type              159
seconds             0
length_of_time    143
desc                3
recorded            0
lat                 0
long                0
dtype: int64

In [8]:
#drop missing data
ufo_no_missing = ufo[ufo['length_of_time'].notnull() & #keep only rows where these are not null
                    ufo['type'].notnull() &
                    ufo['state'].notnull()]
ufo_no_missing.isnull().sum()

date                0
city                0
state               0
country           392
type                0
seconds             0
length_of_time      0
desc                0
recorded            0
lat                 0
long                0
dtype: int64

In [9]:
ufo_no_missing.shape

(4283, 11)

In [10]:
def return_mins(time_string):
    pattern = re.compile(r'\d+')
    num = re.match(pattern,time_string)
    if num is not None:
        return int(num.group(0))

In [14]:
ufo_no_missing['minutes'] = ufo_no_missing['length_of_time'].apply(lambda row: return_mins(row))
ufo.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long
0,2011-11-03 19:21:00,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111
1,2004-10-03 19:05:00,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556


In [16]:
ufo_no_missing.describe()

Unnamed: 0,seconds,long,minutes
count,4283.0,4283.0,3778.0
mean,5309.563,-94.406454,11.376654
std,124306.5,20.297169,30.774303
min,0.0,-170.478889,0.0
25%,20.0,-113.993056,2.0
50%,180.0,-88.987778,5.0
75%,600.0,-80.118357,15.0
max,6312000.0,117.897392,1640.0


In [18]:
ufo_no_missing['minutes'] = ufo_no_missing['minutes'].notnull()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
ufo_no_missing.isnull().sum()

date                0
city                0
state               0
country           392
type                0
seconds             0
length_of_time      0
desc                0
recorded            0
lat                 0
long                0
minutes             0
dtype: int64

In [20]:
#standardization
ufo_no_missing['lat'] = ufo_no_missing['lat'].astype('float')
ufo_no_missing['minutes'] = ufo_no_missing['minutes'].astype('float')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [21]:
ufo_no_missing.dtypes

date              datetime64[ns]
city                      object
state                     object
country                   object
type                      object
seconds                  float64
length_of_time            object
desc                      object
recorded                  object
lat                      float64
long                     float64
minutes                  float64
dtype: object

In [22]:
ufo_no_missing.var()

seconds    1.545212e+10
lat        4.371991e+01
long       4.119751e+02
minutes    1.040300e-01
dtype: float64

In [24]:
ufo_no_missing["seconds_log"] = np.log(ufo_no_missing.seconds)

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [26]:
ufo_no_missing.columns

Index(['date', 'city', 'state', 'country', 'type', 'seconds', 'length_of_time',
       'desc', 'recorded', 'lat', 'long', 'minutes', 'seconds_log'],
      dtype='object')

In [33]:
ufo_no_missing.seconds_log.head(2)

0    14.005800
1     3.401197
Name: seconds_log, dtype: float64

In [34]:
#feature eng - encode categorical variables: country,type,dates
ufo_no_missing.head()

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long,minutes,seconds_log
0,2011-11-03 19:21:00,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.953056,-92.291111,1.0,14.0058
1,2004-10-03 19:05:00,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.499444,-81.695556,1.0,3.401197
3,2002-11-21 05:45:00,clemmons,nc,us,triangle,300.0,about 5 minutes,It was a large&#44 triangular shaped flying ob...,12/23/2002,36.021389,-80.382222,0.0,5.703782
4,2010-08-19 12:55:00,calgary (canada),ab,ca,oval,0.0,2,A white spinning disc in the shape of an oval.,8/24/2010,51.083333,-114.083333,1.0,-inf
5,2012-06-16 23:00:00,san diego,ca,us,light,600.0,10 minutes,Dancing lights that would fly around and then ...,7/4/2012,32.715278,-117.156389,1.0,6.39693


In [36]:
####country
ufo_no_missing['country_enc'] = ufo_no_missing['country'].apply(lambda val: 1 if val =='us' else 0)
len(ufo_no_missing.type.unique())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


21

In [38]:
#####type
type_set = pd.get_dummies(ufo_no_missing.type)
ufo1 = pd.concat([ufo_no_missing,type_set],axis=1)
ufo1.head(2)

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,...,flash,formation,light,other,oval,rectangle,sphere,teardrop,triangle,unknown
0,2011-11-03 19:21:00,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.953056,...,0,0,0,0,0,0,0,0,0,1
1,2004-10-03 19:05:00,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.499444,...,0,0,0,0,0,0,0,0,0,0


In [40]:
####date
ufo1["month"] = ufo1["date"].apply(lambda val: val.month)
ufo1["year"] = ufo1["date"].apply(lambda val:val.year)
ufo1[['month','year','date']].head(2)

Unnamed: 0,month,year,date
0,11,2011,2011-11-03 19:21:00
1,10,2004,2004-10-03 19:05:00


In [42]:
####text vectorization using tfidf- desc
vec = TfidfVectorizer()
desc_tfidf = vec.fit_transform(ufo1.desc)
desc_tfidf.shape


(4283, 5754)

In [46]:
ufo1.columns

Index(['date', 'city', 'state', 'country', 'type', 'seconds', 'length_of_time',
       'desc', 'recorded', 'lat', 'long', 'minutes', 'seconds_log',
       'country_enc', 'changing', 'chevron', 'cigar', 'circle', 'cone',
       'cross', 'cylinder', 'diamond', 'disk', 'egg', 'fireball', 'flash',
       'formation', 'light', 'other', 'oval', 'rectangle', 'sphere',
       'teardrop', 'triangle', 'unknown', 'month', 'year'],
      dtype='object')

In [49]:
#feature selection - (ideal dataset- drop location based cols os city; we have month and year so drop date, recorded
#vectorized desc so drop;drop secs, mins,leng-of-time), country, lat,long,state, modeling the ufo dataset
ufo1[['seconds','seconds_log','minutes']].corr()

Unnamed: 0,seconds,seconds_log,minutes
seconds,1.0,0.174331,-0.054263
seconds_log,0.174331,1.0,0.078156
minutes,-0.054263,0.078156,1.0


In [51]:
ufo1.describe()

Unnamed: 0,seconds,lat,long,minutes,seconds_log,country_enc,changing,chevron,cigar,circle,...,light,other,oval,rectangle,sphere,teardrop,triangle,unknown,month,year
count,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0,...,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0
mean,5309.563,38.554724,-94.406454,0.882092,-inf,0.862013,0.028485,0.010507,0.020546,0.097595,...,0.214336,0.070978,0.046463,0.015877,0.065141,0.010507,0.103199,0.074481,6.939528,2003.8954
std,124306.5,6.612103,20.297169,0.322537,,0.344927,0.166372,0.101974,0.141876,0.296801,...,0.410409,0.256819,0.21051,0.125013,0.246804,0.101974,0.304254,0.262582,3.210479,10.590981
min,0.0,-27.42395,-170.478889,0.0,-inf,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1931.0
25%,20.0,34.230833,-113.993056,1.0,2.995732,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2001.0
50%,180.0,39.414167,-88.987778,1.0,5.192957,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,2007.0
75%,600.0,42.445556,-80.118357,1.0,6.39693,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,2011.0
max,6312000.0,64.282327,117.897392,1.0,15.657963,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,12.0,2014.0


In [55]:
def return_weights(vocab, original_vocab, vector, vector_index, top_n):
    zipped = dict(zip(vector[vector_index].indices, vector[vector_index].data))
    
    # Let's transform that zipped dict into a series
    zipped_series = pd.Series({vocab[i]:zipped[i] for i in vector[vector_index].indices})
    
    # Let's sort the series to pull out the top n weighted words
    zipped_index = zipped_series.sort_values(ascending=False)[:top_n].index
    return [original_vocab[i] for i in zipped_index]

def words_to_filter(vocab, original_vocab, vector, top_n):
    filter_list = []
    for i in range(0, vector.shape[0]):
    
        # Here we'll call the function from the previous exercise, and extend the list we're creating
        filtered = return_weights(vocab, original_vocab, vector, i, top_n)
        filter_list.extend(filtered)
    # Return the list in a set, so we don't get duplicate word indices
    return set(filter_list)

In [57]:
to_drop = ["city", "country", "date", "desc", "lat", "length_of_time", "long", "minutes", "recorded", "seconds", "state"]
ufo2 = ufo1.drop(to_drop, axis=1)
#filtered_words = words_to_filter(vocab, vec.vocabulary_, desc_tfidf, 4) --- didnt seem to work

In [62]:
ufo2.columns

Index(['type', 'seconds_log', 'country_enc', 'changing', 'chevron', 'cigar',
       'circle', 'cone', 'cross', 'cylinder', 'diamond', 'disk', 'egg',
       'fireball', 'flash', 'formation', 'light', 'other', 'oval', 'rectangle',
       'sphere', 'teardrop', 'triangle', 'unknown', 'month', 'year'],
      dtype='object')

In [83]:
ufo2.seconds_log.mean()

-inf

In [99]:
ufo2['seconds_logs'] = ufo2['seconds_log'].replace(-np.inf,np.nan)
ufo2.dropna(inplace=True)

In [100]:
ufo2.isnull().sum()

type            0
seconds_log     0
country_enc     0
changing        0
chevron         0
cigar           0
circle          0
cone            0
cross           0
cylinder        0
diamond         0
disk            0
egg             0
fireball        0
flash           0
formation       0
light           0
other           0
oval            0
rectangle       0
sphere          0
teardrop        0
triangle        0
unknown         0
month           0
year            0
seconds_logs    0
dtype: int64

In [120]:
ufo1['seconds_log'] = ufo1['seconds_log'].replace(-np.inf,np.nan)
print(ufo1.seconds_log.head(5))

0    14.005800
1     3.401197
3     5.703782
4          NaN
5     6.396930
Name: seconds_log, dtype: float64


In [122]:
ufo1['seconds_logs'] = ufo1.seconds_log.fillna(np.mean(ufo1.seconds_log))
print(ufo1.seconds_logs.head(5))

0    14.005800
1     3.401197
3     5.703782
4     4.954833
5     6.396930
Name: seconds_logs, dtype: float64


In [123]:
X = ufo1[['seconds_logs', 'changing', 'chevron', 'cigar', 'circle', 'cone',
       'cross', 'cylinder', 'diamond', 'disk', 'egg', 'fireball', 'flash',
       'formation', 'light', 'other', 'oval', 'rectangle', 'sphere',
       'teardrop', 'triangle', 'unknown', 'month', 'year']]
y = ufo2[['country_enc']]

In [124]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

train_X, test_X, train_y, test_y = train_test_split(X,np.ravel(y),stratify=y1)
knn = KNeighborsClassifier()
knn.fit(train_X,train_y)
print(knn.score(test_X,test_y))

0.8525390625
