In [312]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

In [313]:
volunteer = pd.read_csv('Datasets/volunteer_opportunities.csv')
volunteer.shape

(665, 35)

## 1) Missing data

### How many features are in the set after columns with at least 3 missing values are removed?

In [314]:
volunteer.dropna(thresh=3, axis=1, inplace=True)
volunteer.shape

(665, 24)

### We want to drop rows where the category_desc column values are missing.

In [315]:
# Check how many values are missing in the category_desc column
print(volunteer['category_desc'].isnull().sum())

# Subset the volunteer dataset
volunteer_subset = volunteer[volunteer['category_desc'].notnull()]

# Print out the shape of the subset
print(volunteer_subset.shape)

48
(617, 24)


## 2)Converting a column type

In [316]:
# Look at the dtypes of the dataset
print(volunteer.dtypes)

# Print the head of the hits column
print(volunteer["is_priority"].head())

# Convert the hits column to type int
volunteer["hits"] = volunteer.hits.astype('float64')

opportunity_id          int64
content_id              int64
vol_requests            int64
event_time              int64
title                  object
hits                    int64
summary                object
is_priority            object
category_id           float64
category_desc          object
org_title              object
org_content_id          int64
addresses_count         int64
locality               object
region                 object
postalcode            float64
display_url            object
recurrence_type        object
hours                   int64
created_date           object
last_modified_date     object
start_date_date        object
end_date_date          object
status                 object
dtype: object
0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
Name: is_priority, dtype: object


## 3) Standardization

In [317]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

wine = pd.read_csv('Datasets/wine_types.csv')
y = wine.iloc[:, 0].values
X = wine.iloc[:, 1:].values

# Split the dataset and labels into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [318]:
# Create a KMeans instance with k clusters: model
knn = KNeighborsClassifier(n_neighbors=6)

# Fit model to samples
knn.fit(X_train, y_train)

# Score the model on the test data
print(knn.score(X_test, y_test))

0.7777777777777778


#### You can see that the accuracy score is pretty low. Because some columns extremely high variance. Let's explore methods to improve this score. 

In [319]:
wine.var()

Type                                0.600679
Alcohol                             0.659062
Malic acid                          1.248015
Ash                                 0.075265
Alcalinity of ash                  11.152686
Magnesium                         203.989335
Total phenols                       0.391690
Flavanoids                          0.997719
Nonflavanoid phenols                0.015489
Proanthocyanins                     0.327595
Color intensity                     5.374449
Hue                                 0.052245
OD280/OD315 of diluted wines        0.504086
Proline                         99166.717355
dtype: float64

#### The Proline column has an extremely high variance.

In [320]:
# Apply the log normalization function to the Proline column
wine['Proline'] = np.log(wine['Proline'])

# Check the variance of the normalized Proline column
print(wine.Proline.var())

0.17231366191842012


### Standard Scaler for continuous values

In [321]:
# Import StandardScaler from scikit-learn
from sklearn.preprocessing import StandardScaler

# Create the scaler
ss = StandardScaler()

# Take a subset of the DataFrame you want to scale 
wine_subset = wine[['Ash', 'Alcalinity of ash', 'Magnesium']]

# Apply the scaler to the DataFrame subset
wine[['Ash', 'Alcalinity of ash', 'Magnesium']] = ss.fit_transform(wine_subset)

In [322]:
y = wine.iloc[:, 0].values
X = wine.iloc[:, 1:].values

# Split the dataset and labels into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Create a KMeans instance with k clusters: model
knn = KNeighborsClassifier(n_neighbors=6)

# Fit model to samples
knn.fit(X_train, y_train)

# Score the model on the test data
print(knn.score(X_test, y_test))

0.9777777777777777


#### The increase in accuracy is worth the extra step of scaling the dataset.

### 4) Feature Engneering

In [323]:
volunteer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 665 entries, 0 to 664
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   opportunity_id      665 non-null    int64  
 1   content_id          665 non-null    int64  
 2   vol_requests        665 non-null    int64  
 3   event_time          665 non-null    int64  
 4   title               665 non-null    object 
 5   hits                665 non-null    float64
 6   summary             665 non-null    object 
 7   is_priority         62 non-null     object 
 8   category_id         617 non-null    float64
 9   category_desc       617 non-null    object 
 10  org_title           665 non-null    object 
 11  org_content_id      665 non-null    int64  
 12  addresses_count     665 non-null    int64  
 13  locality            595 non-null    object 
 14  region              665 non-null    object 
 15  postalcode          659 non-null    float64
 16  display_

#### title, created_date, category_desc. All three of these columns will require some feature engineering before modeling.

### get_dummies()

In [324]:
# Transform the category_desc column
category_enc = pd.get_dummies(volunteer["category_desc"])

# Take a look at the encoded columns
print(category_enc.head())

   Education  Emergency Preparedness  Environment  Health  \
0          0                       0            0       0   
1          0                       0            0       0   
2          0                       0            0       0   
3          0                       0            0       0   
4          0                       0            1       0   

   Helping Neighbors in Need  Strengthening Communities  
0                          0                          0  
1                          0                          1  
2                          0                          1  
3                          0                          1  
4                          0                          0  


### label encoder( 0 or 1 )

In [325]:
hiking = pd.read_json('Datasets/hiking.json')

In [326]:
uniques = pd.DataFrame()
for col in hiking:
    col_uniques = pd.DataFrame({f'{col}_value': hiking[f'{col}'].value_counts().index,
                                f'{col}_count': hiking[f'{col}'].value_counts().values})
    uniques = pd.concat([uniques, col_uniques], axis = 1)

print('Value counts:')
uniques.head(7)

Value counts:


Unnamed: 0,Prop_ID_value,Prop_ID_count,Name_value,Name_count,Location_value,Location_count,Park_Name_value,Park_Name_count,Length_value,Length_count,...,Other_Details_value,Other_Details_count,Accessible_value,Accessible_count,Limited_Access_value,Limited_Access_count,lat_value,lat_count,lon_value,lon_count
0,R013,5.0,Kazimiroff Trail,1,Enter Park at Lincoln Road and Ocean Avenue en...,3.0,Van Cortlandt Park,5.0,1.5 miles,3.0,...,,2.0,N,32.0,N,32.0,,,,
1,X092,5.0,Blue Trail,1,200 feet ahead of parking lot at the terminus ...,2.0,La Tourette Parks & Golf Course,5.0,0.5 miles,3.0,...,This is the Greenbelt&rsquo;s longest marked t...,2.0,Y,1.0,Y,1.0,,,,
2,B073,4.0,John Muir Trail,1,Willowbrook Park off Victory Boulevard,1.0,Prospect Park,4.0,0.75 miles,3.0,...,Connects to Great Kills Park of the Gateway Na...,2.0,,,,,,,,
3,Q015,3.0,Clove Lakes Park Trail,1,Richmond Road and St. Patrick's Place,1.0,Forest Park,3.0,1.0 mile,2.0,...,Take a step back in time and imagine Manhattan...,1.0,,,,,,,,
4,R088,2.0,Orange Trail,1,Page Avenue & Eugene Street,1.0,High Rock Park,2.0,7.6 miles,2.0,...,Step back in time with a walk through Brooklyn...,1.0,,,,,,,,
5,X039,1.0,Midwood,1,Staten Island Boulevard,1.0,Cunningham Park,1.0,12.3 miles,2.0,...,This gentle walk takes you through a forest of...,1.0,,,,,,,,
6,R115,1.0,Willowbrook Park White Trail,1,Enter the park at Van Cortlandt Park South and...,1.0,Arden Woods,1.0,3.0 miles,2.0,...,This trail will lead you through a 2.4 mile ad...,1.0,,,,,,,,


In [327]:
from sklearn.preprocessing import LabelEncoder

# Set up the LabelEncoder object
enc = LabelEncoder()

# Apply the encoding to the "Accessible" column
hiking['Accessible_enc'] = enc.fit_transform(hiking['Accessible'])

# Compare the two columns
print(hiking[['Accessible', 'Accessible_enc']].head())

  Accessible  Accessible_enc
0          Y               1
1          N               0
2          N               0
3          N               0
4          N               0


### Engineering numerical features - datetime

In [328]:
# First, convert string column to date column
volunteer["start_date_converted"] = pd.to_datetime(volunteer["start_date_date"])

# Extract just the month from the converted column
volunteer["start_date_month"] = volunteer["start_date_converted"].apply(lambda row: row.month)

# Take a look at the converted and new month columns
print(volunteer[['start_date_converted', 'start_date_month']].head())

  start_date_converted  start_date_month
0           2011-07-30                 7
1           2011-02-01                 2
2           2011-01-29                 1
3           2011-02-14                 2
4           2011-02-05                 2


### Text Classification

In [329]:
hiking["Length"] = hiking["Length"].astype('str')

In [330]:
# Write a pattern to extract numbers and decimals
def return_mileage(length):
    pattern = re.compile(r"\d+\.\d+")
    
    # Search the text for matches
    mile = re.match(pattern, length)
    
    # If a value is returned, use group(0) to return the found value
    if mile is not None:
        return float(mile.group(0))
        
# Apply the function to the Length column and take a look at both columns
hiking["Length_num"] = hiking["Length"].apply(lambda row: return_mileage(row) )
print(hiking[["Length", "Length_num"]].head())

       Length  Length_num
0   0.8 miles        0.80
1    1.0 mile        1.00
2  0.75 miles        0.75
3   0.5 miles        0.50
4   0.5 miles        0.50


## 5) Selecting relevant features
Now let's identify the redundant columns in the volunteer dataset and perform feature selection on the dataset to return a DataFrame of the relevant features.
<br>
<br>For example, if you explore the volunteer dataset in the console, you'll see three features which are related to location: locality, region, and postalcode. They contain repeated information, so it would make sense to keep only one of the features.
<br>
<br>There are also features that have gone through the feature engineering process: columns like Education and Emergency Preparedness are a product of encoding the categorical variable category_desc, so category_desc itself is redundant now.

In [331]:
# Create a list of redundant column names to drop
to_drop = ["category_desc", "created_date", "locality", "region", "vol_requests"]

# Drop those columns from the dataset
volunteer_subset = volunteer.drop(to_drop, axis=1)

# Print out the head of the new dataset
print(volunteer_subset.head(1))

   opportunity_id  content_id  event_time  \
0            4996       37004           0   

                                               title   hits  \
0  Volunteers Needed For Rise Up & Stay Put! Home...  737.0   

                                             summary is_priority  category_id  \
0  Building on successful events last summer and ...         NaN          NaN   

                      org_title  org_content_id  ...  postalcode  \
0  Center For NYC Neighborhoods            4426  ...         NaN   

           display_url recurrence_type hours  last_modified_date  \
0  /opportunities/4996         onetime     0        June 23 2011   

  start_date_date end_date_date    status start_date_converted  \
0    July 30 2011  July 30 2011  approved           2011-07-30   

  start_date_month  
0                7  

[1 rows x 21 columns]


In [332]:
wine.corr()

Unnamed: 0,Type,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
Type,1.0,-0.328222,0.437776,-0.049643,0.517859,-0.209179,-0.719163,-0.847498,0.489109,-0.49913,0.265668,-0.617369,-0.78823,-0.569246
Alcohol,-0.328222,1.0,0.094397,0.211545,-0.310235,0.270798,0.289101,0.236815,-0.155929,0.136698,0.546364,-0.071747,0.072343,0.637325
Malic acid,0.437776,0.094397,1.0,0.164045,0.2885,-0.054575,-0.335167,-0.411007,0.292977,-0.220746,0.248985,-0.561296,-0.36871,-0.152643
Ash,-0.049643,0.211545,0.164045,1.0,0.443367,0.286587,0.12898,0.115077,0.18623,0.009652,0.258887,-0.074667,0.003911,0.238394
Alcalinity of ash,0.517859,-0.310235,0.2885,0.443367,1.0,-0.083333,-0.321113,-0.35137,0.361922,-0.197327,0.018732,-0.273955,-0.276769,-0.416897
Magnesium,-0.209179,0.270798,-0.054575,0.286587,-0.083333,1.0,0.214401,0.195784,-0.256294,0.236441,0.19995,0.055398,0.066004,0.424006
Total phenols,-0.719163,0.289101,-0.335167,0.12898,-0.321113,0.214401,1.0,0.864564,-0.449935,0.612413,-0.055136,0.433681,0.699949,0.431205
Flavanoids,-0.847498,0.236815,-0.411007,0.115077,-0.35137,0.195784,0.864564,1.0,-0.5379,0.652692,-0.172379,0.543479,0.787194,0.410494
Nonflavanoid phenols,0.489109,-0.155929,0.292977,0.18623,0.361922,-0.256294,-0.449935,-0.5379,1.0,-0.365845,0.139057,-0.26264,-0.50327,-0.275675
Proanthocyanins,-0.49913,0.136698,-0.220746,0.009652,-0.197327,0.236441,0.612413,0.652692,-0.365845,1.0,-0.02525,0.295544,0.519067,0.290203


### Checking for correlated features
Run Pearson's correlation coefficient on the dataset to determine which columns are good candidates for eliminating.
<br>Take a minute to look at the correlations. Identify a column where the correlation value is greater than 0.75 at least twice and store it in the to_drop variable.

In [333]:
# Print out the column correlations of the wine dataset
print(wine.corr())

                                  Type   Alcohol  Malic acid       Ash  \
Type                          1.000000 -0.328222    0.437776 -0.049643   
Alcohol                      -0.328222  1.000000    0.094397  0.211545   
Malic acid                    0.437776  0.094397    1.000000  0.164045   
Ash                          -0.049643  0.211545    0.164045  1.000000   
Alcalinity of ash             0.517859 -0.310235    0.288500  0.443367   
Magnesium                    -0.209179  0.270798   -0.054575  0.286587   
Total phenols                -0.719163  0.289101   -0.335167  0.128980   
Flavanoids                   -0.847498  0.236815   -0.411007  0.115077   
Nonflavanoid phenols          0.489109 -0.155929    0.292977  0.186230   
Proanthocyanins              -0.499130  0.136698   -0.220746  0.009652   
Color intensity               0.265668  0.546364    0.248985  0.258887   
Hue                          -0.617369 -0.071747   -0.561296 -0.074667   
OD280/OD315 of diluted wines -0.788230

In [334]:
# Take a minute to find the column where the correlation value is greater than 0.75 at least twice
to_drop = "Flavanoids"

# Drop that column from the DataFrame
wine = wine.drop(to_drop, axis=1)

#### Dropping correlated features is often an iterative process, so you may need to try different combinations in your model.

## 6) Selecting features using text vectors

In [335]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Take the title text
title_text = volunteer["title"]

# Create the vectorizer method
tfidf_vec = TfidfVectorizer()

# Transform the text into tf-idf vectors
text_tfidf = tfidf_vec.fit_transform(title_text)

In [336]:
vocab = {}
with open('Datasets/vocab.txt', 'r') as file:
    text = file.readline()
    while text != '':
        text_list = text.split(':')
        key = int(text_list[0])
        value = text_list[1].split('\n')[0]
        vocab[key] = value
        text = file.readline()

In [337]:
# Add in the rest of the parameters
def return_weights(vocab, original_vocab, vector, vector_index, top_n):
    zipped = dict(zip(vector[vector_index].indices, vector[vector_index].data))
    
    # Let's transform that zipped dict into a series
    zipped_series = pd.Series({vocab[i]:zipped[i] for i in vector[vector_index].indices})
    
    # Let's sort the series to pull out the top n weighted words
    zipped_index = zipped_series.sort_values(ascending=False)[:top_n].index
    return [original_vocab[i] for i in zipped_index]

# Print out the weighted words
#to grab the 9th row, and setting top_n=3, to grab the top 3 weighted words.
print(return_weights(vocab, tfidf_vec.vocabulary_, text_tfidf, 8, 3)) 

[201, 27, 590]


In [338]:
def words_to_filter(vocab, original_vocab, vector, top_n):
    filter_list = []
    for i in range(0, vector.shape[0]):
    
        # Here we'll call the function from the previous exercise, and extend the list we're creating
        try:
            filtered = return_weights(vocab, original_vocab, vector, i, top_n)
        except:
            continue
        filter_list.extend(filtered)
    # Return the list in a set, so we don't get duplicate word indices
    return set(filter_list)

# Call the function to get the list of word indices
filtered_words = words_to_filter(vocab, tfidf_vec.vocabulary_, text_tfidf, 3)
print(len(filtered_words))

# By converting filtered_words back to a list, we can use it to filter the columns in the text vector
filtered_text = text_tfidf[:, list(filtered_words)]
print(filtered_text.shape)

836
(665, 836)


In [None]:
# Split the dataset according to the class distribution of category_desc
train_X, test_X, train_y, test_y = train_test_split(filtered_text.toarray(), y, stratify=y)

# Fit the model to the training data
nb.fit(train_X, train_y)

# Print out the model's accuracy
print(nb.score(test_X, test_y))

## 7) Dimensionality reduction

In [340]:
from sklearn.decomposition import PCA

# Set up PCA and the X vector for diminsionality reduction
pca = PCA()
wine_X = wine.drop("Type", axis=1)

# Apply PCA to the wine dataset X vector
transformed_X = pca.fit_transform(wine_X)

# Look at the percentage of variance explained by the different components
print(pca.explained_variance_ratio_)

[0.50688767 0.16256262 0.11618906 0.07365662 0.05773053 0.03415481
 0.0212432  0.01147232 0.00811959 0.0053799  0.00183666 0.00076702]


In [341]:
from sklearn.neighbors import KNeighborsClassifier

# Split the transformed X and the y labels into training and test sets
X_wine_train, X_wine_test, y_wine_train, y_wine_test = train_test_split(transformed_X, y)

knn = KNeighborsClassifier()

# Fit knn to the training data
knn.fit(X_wine_train, y_wine_train)

# Score knn on the test data and print it out
print(knn.score(X_wine_test, y_wine_test))

0.9111111111111111


#### PCA is a decent choice for the wine dataset.

## 8) Case Study 

In [342]:
ufo = pd.read_csv('Datasets/ufo_sightings_large.csv')
ufo.head(1)

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long
0,11/3/2011 19:21,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111


### Checking column types

In [343]:
ufo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4935 entries, 0 to 4934
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            4935 non-null   object 
 1   city            4926 non-null   object 
 2   state           4516 non-null   object 
 3   country         4255 non-null   object 
 4   type            4776 non-null   object 
 5   seconds         4935 non-null   float64
 6   length_of_time  4792 non-null   object 
 7   desc            4932 non-null   object 
 8   recorded        4935 non-null   object 
 9   lat             4935 non-null   object 
 10  long            4935 non-null   float64
dtypes: float64(2), object(9)
memory usage: 424.2+ KB


In [344]:
ufo.date = pd.to_datetime(ufo.date)
ufo.date.head(1)

0   2011-11-03 19:21:00
Name: date, dtype: datetime64[ns]

### Dropping missing data

In [345]:
# Check how many values are missing in the length_of_time, state, and type columns
print(ufo[['length_of_time', 'state', 'type']].isnull().sum())

# Keep only rows where length_of_time, state, and type are not null
ufo_no_missing = ufo[ufo.length_of_time.notnull() & 
          ufo.state.notnull() & 
          ufo.type.notnull()]

# Print out the shape of the new dataset
print(ufo_no_missing.shape)

length_of_time    143
state             419
type              159
dtype: int64
(4283, 11)


### Extracting numbers from strings

In [346]:
def return_minutes(time_string):

    # Use \d+ to grab digits
    pattern = re.compile(r"\d+")
    
    # Use match on the pattern and column
    num = re.match(pattern, str(time_string))
    if num is not None:
        return int(num.group(0))
        
# Apply the extraction to the length_of_time column
ufo["minutes"] = ufo["length_of_time"].apply(return_minutes)

# Take a look at the head of both of the columns
print(ufo[['minutes', 'length_of_time']].head())

   minutes   length_of_time
0      2.0          2 weeks
1     30.0           30sec.
2      NaN              NaN
3      NaN  about 5 minutes
4      2.0                2


### Identifying features for standardization

In [347]:
# Check the variance of the seconds and minutes columns
print(ufo[["seconds", "minutes"]].var())

# Log normalize the seconds column
ufo["seconds_log"] = round(np.log(ufo['seconds']), 4)

# Print out the variance of just the seconds_log column
print(ufo["seconds_log"].var())

seconds    3.156735e+10
minutes    8.709933e+02
dtype: float64
nan


  result = getattr(ufunc, method)(*inputs, **kwargs)


### Engineering new features

In [348]:
# Use Pandas to encode us values as 1 and others as 0
ufo["country_enc"] = ufo["country"].apply(lambda x : 1 if x=='us' else 0)

# Print the number of unique type values
print(len(ufo.type.unique()))

# Create a one-hot encoded set of the type values
type_set = pd.get_dummies(ufo.type)

# Concatenate this set back to the ufo DataFrame
ufo = pd.concat([ufo, type_set], axis=1)

22


### Features from dates

In [349]:
# Look at the first 5 rows of the date column
print(ufo.date.head(1))

# Extract the month from the date column
ufo["month"] = ufo["date"].apply(lambda x : x.month)

# Extract the year from the date column
ufo["year"] = ufo["date"].apply(lambda x : x.year)

# Take a look at the head of all three columns
print(ufo[['date', 'month', 'year']].head(1))

0   2011-11-03 19:21:00
Name: date, dtype: datetime64[ns]
                 date  month  year
0 2011-11-03 19:21:00     11  2011


### Text vectorization

In [350]:
ufo["desc"].isnull().sum()

3

In [351]:
ufo.desc.replace(np.nan, "empty", inplace=True)

In [352]:
# Take a look at the head of the desc field
print(ufo["desc"].head(1))

# Create the tfidf vectorizer object
vec = TfidfVectorizer()

# Use vec's fit_transform method on the desc field
desc_tfidf = vec.fit_transform(ufo["desc"])

# Look at the number of columns this creates
print(desc_tfidf.shape)

0    Red blinking objects similar to airplanes or s...
Name: desc, dtype: object
(4935, 6433)


#### You'll notice that the text vector has a large number of columns. We'll work on selecting the features we want to use for modeling in the next section.

### Selecting the ideal dataset

In [353]:
vocab2 = {}
with open('Datasets/vocab2.txt', 'r') as file:
    text = file.readline()
    while text != '':
        text_list = text.split(':')
        key = int(text_list[0])
        value = text_list[1].split('\n')[0]
        vocab[key] = value
        text = file.readline()

In [354]:
# Check the correlation between the seconds, seconds_log, and minutes columns
print(ufo[["seconds", "seconds_log", "minutes"]].corr())

# Make a list of features to drop   
to_drop = ["city", "country", "date", "desc", "lat", "length_of_time", "long", "minutes", "recorded", "seconds", "state"]

# Drop those features
ufo_dropped = ufo.drop(to_drop, axis=1)

# Let's also filter some words out of the text vector we created
filtered_words = words_to_filter(vocab, vec.vocabulary_, desc_tfidf, 4)

print(len(filtered_words))

              seconds  seconds_log   minutes
seconds      1.000000     0.164613 -0.008161
seconds_log  0.164613     1.000000  0.110072
minutes     -0.008161     0.110072  1.000000
55


In [355]:
ufo_dropped = ufo_dropped.drop(['type'], axis=1)

In [356]:
ufo_dropped.seconds_log.replace(float('-inf'), -46.05, inplace=True)
ufo_dropped.seconds_log.replace(float('inf'), 46.05, inplace=True)

In [357]:
X = ufo_dropped.drop(['country_enc'], axis=1)
y = ufo_dropped.country_enc

In [358]:
ufo_dropped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4935 entries, 0 to 4934
Data columns (total 25 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   seconds_log  4935 non-null   float64
 1   country_enc  4935 non-null   int64  
 2   changing     4935 non-null   uint8  
 3   chevron      4935 non-null   uint8  
 4   cigar        4935 non-null   uint8  
 5   circle       4935 non-null   uint8  
 6   cone         4935 non-null   uint8  
 7   cross        4935 non-null   uint8  
 8   cylinder     4935 non-null   uint8  
 9   diamond      4935 non-null   uint8  
 10  disk         4935 non-null   uint8  
 11  egg          4935 non-null   uint8  
 12  fireball     4935 non-null   uint8  
 13  flash        4935 non-null   uint8  
 14  formation    4935 non-null   uint8  
 15  light        4935 non-null   uint8  
 16  other        4935 non-null   uint8  
 17  oval         4935 non-null   uint8  
 18  rectangle    4935 non-null   uint8  
 19  sphere

In [359]:
from sklearn.neighbors import KNeighborsClassifier

# Take a look at the features in the X set of data
print(X.columns)

# Split the X and y sets using train_test_split, setting stratify=y
train_X, test_X, train_y, test_y = train_test_split(X, y, stratify=y)

knn = KNeighborsClassifier()

# Fit knn to the training sets
knn.fit(train_X, train_y)

# Print the score of knn on the test sets
print(knn.score(test_X, test_y))

Index(['seconds_log', 'changing', 'chevron', 'cigar', 'circle', 'cone',
       'cross', 'cylinder', 'diamond', 'disk', 'egg', 'fireball', 'flash',
       'formation', 'light', 'other', 'oval', 'rectangle', 'sphere',
       'teardrop', 'triangle', 'unknown', 'month', 'year'],
      dtype='object')
0.7495948136142626


####  This model performs pretty well. It seems like we've made pretty good feature selection choices here.

In [360]:
from sklearn.naive_bayes import GaussianNB

# Use the list of filtered words we created to filter the text vector
filtered_text = desc_tfidf[:, list(filtered_words)]
# Split the X and y sets using train_test_split, setting stratify=y 
train_X, test_X, train_y, test_y = train_test_split(filtered_text.toarray(), y, stratify=y)

nb = GaussianNB()
# Fit nb to the training sets
nb.fit(train_X, train_y)

# Print the score of nb on the test sets
print(nb.score(test_X, test_y))

0.22447325769854132


#### This model performs very poorly on this text data. This is a clear case where iteration would be necessary to figure out what subset of text improves the model, and if perhaps any of the other features are useful in predicting type.