# Clean data

## reading in data

In [1]:
import pandas as pd
import numpy as np
import math

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.externals import joblib

from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier



In [2]:
tweets_raw = pd.read_csv('airline_tweets.csv', encoding = 'unicode_escape')

In [3]:
tweets_raw.shape

(14640, 20)

In [4]:
cities_raw = pd.read_csv('cities.csv')

In [5]:
cities_raw.shape

(23278, 19)

In [6]:
tweets_raw.dtypes

_unit_id                          int64
_golden                            bool
_unit_state                      object
_trusted_judgments                int64
_last_judgment_at                object
airline_sentiment                object
airline_sentiment:confidence    float64
negativereason                   object
negativereason:confidence       float64
airline                          object
airline_sentiment_gold           object
name                             object
negativereason_gold              object
retweet_count                     int64
text                             object
tweet_coord                      object
tweet_created                    object
tweet_id                        float64
tweet_location                   object
user_timezone                    object
dtype: object

In [7]:
tweets_raw

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,airline_sentiment,airline_sentiment:confidence,negativereason,negativereason:confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,681448150,False,finalized,3,2/25/15 5:24,neutral,1.0000,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2/24/15 11:35,5.703060e+17,,Eastern Time (US & Canada)
1,681448153,False,finalized,3,2/25/15 1:53,positive,0.3486,,0.0000,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2/24/15 11:15,5.703010e+17,,Pacific Time (US & Canada)
2,681448156,False,finalized,3,2/25/15 10:01,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2/24/15 11:15,5.703010e+17,Lets Play,Central Time (US & Canada)
3,681448158,False,finalized,3,2/25/15 3:05,negative,1.0000,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2/24/15 11:15,5.703010e+17,,Pacific Time (US & Canada)
4,681448159,False,finalized,3,2/25/15 5:50,negative,1.0000,Can't Tell,1.0000,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2/24/15 11:14,5.703010e+17,,Pacific Time (US & Canada)
5,681448162,False,finalized,3,2/25/15 9:10,negative,1.0000,Can't Tell,0.6842,Virgin America,,jnardino,,0,@VirginAmerica seriously would pay $30 a fligh...,,2/24/15 11:14,5.703010e+17,,Pacific Time (US & Canada)
6,681448165,False,finalized,3,2/25/15 8:11,positive,0.6745,,0.0000,Virgin America,,cjmcginnis,,0,"@VirginAmerica yes, nearly every time I fly VX...",,2/24/15 11:13,5.703010e+17,San Francisco CA,Pacific Time (US & Canada)
7,681448167,False,finalized,3,2/25/15 2:11,neutral,0.6340,,,Virgin America,,pilot,,0,@VirginAmerica Really missed a prime opportuni...,,2/24/15 11:12,5.703000e+17,Los Angeles,Pacific Time (US & Canada)
8,681448169,False,finalized,3,2/25/15 9:01,positive,0.6559,,,Virgin America,,dhepburn,,0,"@virginamerica Well, I didn'tÛ_but NOW I DO! :-D",,2/24/15 11:11,5.703000e+17,San Diego,Pacific Time (US & Canada)
9,681448171,False,finalized,3,2/25/15 4:15,positive,1.0000,,,Virgin America,,YupitsTate,,0,"@VirginAmerica it was amazing, and arrived an ...",,2/24/15 10:53,5.702950e+17,Los Angeles,Eastern Time (US & Canada)


In [8]:
tweets_df = tweets_raw[['airline_sentiment','tweet_coord']]

In [9]:
cities_raw.dtypes

geonameid              int64
name                  object
asciiname             object
alternatenames        object
latitude             float64
longitude            float64
feature class         object
feature code          object
country code          object
cc2                   object
admin1 code           object
admin2 code           object
admin3 code           object
admin4 code           object
population             int64
elevation            float64
dem                    int64
timezone              object
modification date     object
dtype: object

In [10]:
cities_raw

Unnamed: 0,geonameid,name,asciiname,alternatenames,latitude,longitude,feature class,feature code,country code,cc2,admin1 code,admin2 code,admin3 code,admin4 code,population,elevation,dem,timezone,modification date
0,3040051,les Escaldes,les Escaldes,"Ehskal'des-Ehndzhordani,Escaldes,Escaldes-Engo...",42.50729,1.53414,P,PPLA,AD,,08,,,,15853,,1033,Europe/Andorra,2008-10-15
1,3041563,Andorra la Vella,Andorra la Vella,"ALV,Ando-la-Vyey,Andora,Andora la Vela,Andora ...",42.50779,1.52109,P,PPLC,AD,,07,,,,20430,,1037,Europe/Andorra,2010-05-30
2,290594,Umm al Qaywayn,Umm al Qaywayn,"Oumm al Qaiwain,Oumm al Qaïwaïn,Um al Kawain,U...",25.56473,55.55517,P,PPLA,AE,,07,,,,44411,,2,Asia/Dubai,2014-10-07
3,291074,Ras al-Khaimah,Ras al-Khaimah,"Julfa,Khaimah,RKT,Ra's al Khaymah,Ra's al-Chai...",25.78953,55.94320,P,PPLA,AE,,05,,,,115949,,2,Asia/Dubai,2015-12-05
4,291696,Khawr Fakkān,Khawr Fakkan,"Fakkan,Fakkān,Khawr Fakkan,Khawr Fakkān,Khawr ...",25.33132,56.34199,P,PPL,AE,,06,,,,33575,,20,Asia/Dubai,2013-10-25
5,292223,Dubai,Dubai,"DXB,Dabei,Dibai,Dibay,Doubayi,Dubae,Dubai,Duba...",25.06570,55.17128,P,PPLA,AE,,03,,,,1137347,,3,Asia/Dubai,2014-12-02
6,292231,Dibba Al-Fujairah,Dibba Al-Fujairah,"Al-Fujairah,BYB,Dibba Al-Fujairah,dba alfjyrt,...",25.59246,56.26176,P,PPL,AE,,04,,,,30000,,16,Asia/Dubai,2014-08-12
7,292239,Dibba Al-Hisn,Dibba Al-Hisn,"BYB,Daba,Daba al-Hisn,Dabā,Dabā al-Ḥiṣn,Diba,D...",25.61955,56.27291,P,PPL,AE,,04,,,,26395,,4,Asia/Dubai,2014-04-21
8,292672,Sharjah,Sharjah,"Al Sharjah,Ash 'Mariqah,Ash Shariqa,Ash Shariq...",25.33737,55.41206,P,PPLA,AE,,06,,,,543733,,6,Asia/Dubai,2013-03-05
9,292688,Ar Ruways,Ar Ruways,"Ar Ru'ays,Ar Ruways,Ar Ru’ays,Ar-Ruvais,Ruwais...",24.11028,52.73056,P,PPL,AE,AE,01,,,,16000,,16,Asia/Dubai,2012-11-03


In [11]:
cities_df = cities_raw[['name','latitude','longitude']]

## cleaning

In [12]:
# checking for missing values in tweets
tweets_df.isnull().sum()

airline_sentiment        0
tweet_coord          13621
dtype: int64

In [13]:
# dropping NA
tweets_df = tweets_df[tweets_df['tweet_coord'].notnull()]

In [14]:
# dropping '[0.0, 0.0]'
tweets_df = tweets_df.loc[tweets_df['tweet_coord'] != '[0.0, 0.0]']

In [15]:
tweets_df.head()

Unnamed: 0,airline_sentiment,tweet_coord
21,positive,"[40.74804263, -73.99295302]"
28,negative,"[42.361016, -71.02000488]"
29,neutral,"[33.94540417, -118.4062472]"
32,negative,"[33.94209449, -118.40410103]"
34,positive,"[33.2145038, -96.9321504]"


In [16]:
# checking for missing values in cities
cities_df.isnull().sum()

name         0
latitude     0
longitude    0
dtype: int64

In [17]:
cities_df.head()

Unnamed: 0,name,latitude,longitude
0,les Escaldes,42.50729,1.53414
1,Andorra la Vella,42.50779,1.52109
2,Umm al Qaywayn,25.56473,55.55517
3,Ras al-Khaimah,25.78953,55.9432
4,Khawr Fakkān,25.33132,56.34199


# Prepare training data

## encoding classes

In [18]:
class_dict = {'negative':0,'neutral':1,'positive':2}
tweets_df = tweets_df.replace({'airline_sentiment':class_dict})

## finding closest city

In [19]:
cities_df.head()

Unnamed: 0,name,latitude,longitude
0,les Escaldes,42.50729,1.53414
1,Andorra la Vella,42.50779,1.52109
2,Umm al Qaywayn,25.56473,55.55517
3,Ras al-Khaimah,25.78953,55.9432
4,Khawr Fakkān,25.33132,56.34199


In [20]:
tweets_df.head()

Unnamed: 0,airline_sentiment,tweet_coord
21,2,"[40.74804263, -73.99295302]"
28,0,"[42.361016, -71.02000488]"
29,1,"[33.94540417, -118.4062472]"
32,0,"[33.94209449, -118.40410103]"
34,2,"[33.2145038, -96.9321504]"


In [21]:
def get_lat_long(string):
    lat = float(string.strip('[]').split(',')[0])
    long = float(string.strip('[]').split(',')[1])
    return lat, long

In [22]:
tweets_df['lat'] = tweets_df['tweet_coord'].apply(lambda x: get_lat_long(x)[0])

In [23]:
tweets_df['long'] = tweets_df['tweet_coord'].apply(lambda x: get_lat_long(x)[1])

In [24]:
tweets_df

Unnamed: 0,airline_sentiment,tweet_coord,lat,long
21,2,"[40.74804263, -73.99295302]",40.748043,-73.992953
28,0,"[42.361016, -71.02000488]",42.361016,-71.020005
29,1,"[33.94540417, -118.4062472]",33.945404,-118.406247
32,0,"[33.94209449, -118.40410103]",33.942094,-118.404101
34,2,"[33.2145038, -96.9321504]",33.214504,-96.932150
42,1,"[34.0219817, -118.38591198]",34.021982,-118.385912
62,1,"[33.57963333, -117.73024772]",33.579633,-117.730248
69,0,"[40.6413712, -73.78311558]",40.641371,-73.783116
74,2,"[36.08457854, -115.13780136]",36.084579,-115.137801
108,1,"[37.79374402, -122.39327564]",37.793744,-122.393276


### getting the nearest city

#### method 1: great circle distance

In [25]:
# calculating the great circle distance between two points 
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(math.radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
    return math.asin(math.sqrt(a)) # omitted the factor of radius*2 because we only want a rank

In [26]:
def closest_city_haversine(long, lat):
    dist_list = []
    for i, row_city in cities_df.iterrows():
        dist_list.append(haversine(long, lat, row_city['longitude'], row_city['latitude']))
    return cities_df.loc[np.argmin(dist_list)]['name']   

In [27]:
tweets_df['city_haversine'] = tweets_df.apply(lambda x: closest_city_haversine(x['long'], x['lat']), axis=1)

In [28]:
tweets_df

Unnamed: 0,airline_sentiment,tweet_coord,lat,long,city_haversine
21,2,"[40.74804263, -73.99295302]",40.748043,-73.992953,Hoboken
28,0,"[42.361016, -71.02000488]",42.361016,-71.020005,Boston
29,1,"[33.94540417, -118.4062472]",33.945404,-118.406247,El Segundo
32,0,"[33.94209449, -118.40410103]",33.942094,-118.404101,El Segundo
34,2,"[33.2145038, -96.9321504]",33.214504,-96.932150,Frisco
42,1,"[34.0219817, -118.38591198]",34.021982,-118.385912,Culver City
62,1,"[33.57963333, -117.73024772]",33.579633,-117.730248,Aliso Viejo
69,0,"[40.6413712, -73.78311558]",40.641371,-73.783116,Springfield Gardens
74,2,"[36.08457854, -115.13780136]",36.084579,-115.137801,Paradise
108,1,"[37.79374402, -122.39327564]",37.793744,-122.393276,San Francisco


#### method 2: 3-d Euclidean distance

In [29]:
def get_3d_coords(lon, lat):
    x = math.cos(lat)*math.cos(lon) # radius is ignored because we only want a rank
    y = math.cos(lat)*math.sin(lon)
    z = math.sin(lat)
    return x, y, z

In [30]:
def euclidean_3d(lon1, lat1, lon2, lat2):
    """
    Calculate the 3d Euclidean distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(math.radians, [lon1, lat1, lon2, lat2])

    x1, y1, z1 = get_3d_coords(lon1, lat1)
    x2, y2, z2 = get_3d_coords(lon2, lat2)
    
    return (x1-x2)**2 + (y1-y2)**2 + (z1-z2)**2 # sqrt is omitted beause we only want a rank

In [31]:
def closest_city_euclidean_3d(long, lat):
    dist_list = []
    for i, row_city in cities_df.iterrows():
        dist_list.append(euclidean_3d(long, lat, row_city['longitude'], row_city['latitude']))
    return cities_df.loc[np.argmin(dist_list)]['name']   

In [32]:
tweets_df['city_euclidean_3d'] = tweets_df.apply(lambda x: closest_city_euclidean_3d(x['long'], x['lat']), axis=1)

In [33]:
tweets_df

Unnamed: 0,airline_sentiment,tweet_coord,lat,long,city_haversine,city_euclidean_3d
21,2,"[40.74804263, -73.99295302]",40.748043,-73.992953,Hoboken,Hoboken
28,0,"[42.361016, -71.02000488]",42.361016,-71.020005,Boston,Boston
29,1,"[33.94540417, -118.4062472]",33.945404,-118.406247,El Segundo,El Segundo
32,0,"[33.94209449, -118.40410103]",33.942094,-118.404101,El Segundo,El Segundo
34,2,"[33.2145038, -96.9321504]",33.214504,-96.932150,Frisco,Frisco
42,1,"[34.0219817, -118.38591198]",34.021982,-118.385912,Culver City,Culver City
62,1,"[33.57963333, -117.73024772]",33.579633,-117.730248,Aliso Viejo,Aliso Viejo
69,0,"[40.6413712, -73.78311558]",40.641371,-73.783116,Springfield Gardens,Springfield Gardens
74,2,"[36.08457854, -115.13780136]",36.084579,-115.137801,Paradise,Paradise
108,1,"[37.79374402, -122.39327564]",37.793744,-122.393276,San Francisco,San Francisco


##### a much faster way..

In [34]:
tweets_df['lat_rad'] = tweets_df['lat'].apply(math.radians)
tweets_df['lon_rad'] = tweets_df['long'].apply(math.radians)
cities_df['lat_rad'] = cities_df['latitude'].apply(math.radians)
cities_df['lon_rad'] = cities_df['longitude'].apply(math.radians)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [35]:
tweets_df['x'] = pd.DataFrame(tweets_df.apply(lambda x: get_3d_coords(x['lon_rad'], x['lat_rad'])[0], axis=1))
tweets_df['y'] = pd.DataFrame(tweets_df.apply(lambda x: get_3d_coords(x['lon_rad'], x['lat_rad'])[1], axis=1))
tweets_df['z'] = pd.DataFrame(tweets_df.apply(lambda x: get_3d_coords(x['lon_rad'], x['lat_rad'])[2], axis=1))

In [36]:
cities_df['x'] = pd.DataFrame(cities_df.apply(lambda x: get_3d_coords(x['lon_rad'], x['lat_rad'])[0], axis=1))
cities_df['y'] = pd.DataFrame(cities_df.apply(lambda x: get_3d_coords(x['lon_rad'], x['lat_rad'])[1], axis=1))
cities_df['z'] = pd.DataFrame(cities_df.apply(lambda x: get_3d_coords(x['lon_rad'], x['lat_rad'])[2], axis=1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [37]:
knn = KNeighborsClassifier(n_neighbors=1)

In [38]:
knn.fit(cities_df[['x','y','z']], cities_df['name'])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [39]:
tweets_df['city_euclidean_3d_knn'] = knn.predict(tweets_df[['x','y','z']])

In [40]:
tweets_df

Unnamed: 0,airline_sentiment,tweet_coord,lat,long,city_haversine,city_euclidean_3d,lat_rad,lon_rad,x,y,z,city_euclidean_3d_knn
21,2,"[40.74804263, -73.99295302]",40.748043,-73.992953,Hoboken,Hoboken,0.711188,-1.291421,0.208909,-0.728214,0.652734,Hoboken
28,0,"[42.361016, -71.02000488]",42.361016,-71.020005,Boston,Boston,0.739339,-1.239533,0.240323,-0.698741,0.673800,Boston
29,1,"[33.94540417, -118.4062472]",33.945404,-118.406247,El Segundo,El Segundo,0.592459,-2.066579,-0.394643,-0.729687,0.558403,El Segundo
32,0,"[33.94209449, -118.40410103]",33.942094,-118.404101,El Segundo,El Segundo,0.592401,-2.066541,-0.394631,-0.729730,0.558355,El Segundo
34,2,"[33.2145038, -96.9321504]",33.214504,-96.932150,Frisco,Frisco,0.579702,-1.691785,-0.100976,-0.830510,0.547775,Frisco
42,1,"[34.0219817, -118.38591198]",34.021982,-118.385912,Culver City,Culver City,0.593796,-2.066224,-0.394029,-0.729170,0.559511,Culver City
62,1,"[33.57963333, -117.73024772]",33.579633,-117.730248,Aliso Viejo,Aliso Viejo,0.586075,-2.054780,-0.387658,-0.737433,0.553095,Aliso Viejo
69,0,"[40.6413712, -73.78311558]",40.641371,-73.783116,Springfield Gardens,Springfield Gardens,0.709326,-1.287758,0.211914,-0.728610,0.651322,Springfield Gardens
74,2,"[36.08457854, -115.13780136]",36.084579,-115.137801,Paradise,Paradise,0.629795,-2.009534,-0.343299,-0.731608,0.588979,Paradise
108,1,"[37.79374402, -122.39327564]",37.793744,-122.393276,San Francisco,San Francisco,0.659625,-2.136166,-0.423344,-0.667256,0.612821,San Francisco


In [41]:
cities_counts = pd.DataFrame(tweets_df['city_euclidean_3d_knn'].value_counts())

In [42]:
cities_counts.shape

(293, 1)

In [43]:
cities_counts[cities_counts['city_euclidean_3d_knn'] == 1].shape

(161, 1)

In [44]:
# half of the cities just appeared once in the data set
161/293

0.5494880546075085

#### method 3: Euclidean on latitude and longitude

In [45]:
X = cities_df[['latitude','longitude']]
y = cities_df['name']

In [46]:
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [47]:
tweets_df['city_euclidean'] = knn.predict(tweets_df[['lat','long']])

In [48]:
tweets_df

Unnamed: 0,airline_sentiment,tweet_coord,lat,long,city_haversine,city_euclidean_3d,lat_rad,lon_rad,x,y,z,city_euclidean_3d_knn,city_euclidean
21,2,"[40.74804263, -73.99295302]",40.748043,-73.992953,Hoboken,Hoboken,0.711188,-1.291421,0.208909,-0.728214,0.652734,Hoboken,New York City
28,0,"[42.361016, -71.02000488]",42.361016,-71.020005,Boston,Boston,0.739339,-1.239533,0.240323,-0.698741,0.673800,Boston,Chelsea
29,1,"[33.94540417, -118.4062472]",33.945404,-118.406247,El Segundo,El Segundo,0.592459,-2.066579,-0.394643,-0.729687,0.558403,El Segundo,El Segundo
32,0,"[33.94209449, -118.40410103]",33.942094,-118.404101,El Segundo,El Segundo,0.592401,-2.066541,-0.394631,-0.729730,0.558355,El Segundo,El Segundo
34,2,"[33.2145038, -96.9321504]",33.214504,-96.932150,Frisco,Frisco,0.579702,-1.691785,-0.100976,-0.830510,0.547775,Frisco,Frisco
42,1,"[34.0219817, -118.38591198]",34.021982,-118.385912,Culver City,Culver City,0.593796,-2.066224,-0.394029,-0.729170,0.559511,Culver City,Culver City
62,1,"[33.57963333, -117.73024772]",33.579633,-117.730248,Aliso Viejo,Aliso Viejo,0.586075,-2.054780,-0.387658,-0.737433,0.553095,Aliso Viejo,Aliso Viejo
69,0,"[40.6413712, -73.78311558]",40.641371,-73.783116,Springfield Gardens,Springfield Gardens,0.709326,-1.287758,0.211914,-0.728610,0.651322,Springfield Gardens,Springfield Gardens
74,2,"[36.08457854, -115.13780136]",36.084579,-115.137801,Paradise,Paradise,0.629795,-2.009534,-0.343299,-0.731608,0.588979,Paradise,Paradise
108,1,"[37.79374402, -122.39327564]",37.793744,-122.393276,San Francisco,San Francisco,0.659625,-2.136166,-0.423344,-0.667256,0.612821,San Francisco,San Francisco


## one-hot encoding

In [49]:
lb = preprocessing.LabelBinarizer()
lb.fit(tweets_df['city_euclidean_3d_knn'])

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [50]:
tweets_df_oh = pd.DataFrame(lb.transform(tweets_df['city_euclidean_3d_knn']), columns=lb.classes_)

In [51]:
# checking row 6 for accuracy
tweets_df_oh

Unnamed: 0,Addison,Aldine,Aliso Viejo,Allen,American Fork,Anaheim,Annapolis,Arbutus,Ashford,Ashland,...,Willow Grove,Willowdale,Windsor,Winnipeg,Winter Park,Winthrop,Wolverhampton,Woodstock,Xiuying,Zionsville
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
tweets_df_oh.shape

(855, 293)

In [53]:
tweets_df.shape

(855, 13)

## train-test split

In [54]:
X_train, X_test, y_train, y_test = train_test_split(tweets_df_oh, tweets_df['airline_sentiment'], test_size=0.2, random_state=42)

In [55]:
X_train.head()

Unnamed: 0,Addison,Aldine,Aliso Viejo,Allen,American Fork,Anaheim,Annapolis,Arbutus,Ashford,Ashland,...,Willow Grove,Willowdale,Windsor,Winnipeg,Winter Park,Winthrop,Wolverhampton,Woodstock,Xiuying,Zionsville
788,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
101,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
790,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
530,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
430,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
y_train.head()

13452    0
1461     0
13493    0
8342     0
6487     0
Name: airline_sentiment, dtype: int64

# Train model

## checking class balance

In [57]:
y_train.value_counts()

0    467
2    115
1    102
Name: airline_sentiment, dtype: int64

## baseline model (no model)

In [58]:
print(f'Training Accuracy: {accuracy_score(y_train, np.zeros(len(y_train)))}')
print(f'Test Accuracy: {accuracy_score(y_test, np.zeros(len(y_test)))}')
print(f'Test F1 Score: {f1_score(y_test, np.zeros(len(y_test)), average="macro")}')
print("\n----------- Baseline Classification Report -----------\n")
print(classification_report(y_test, np.zeros(len(y_test))))

Training Accuracy: 0.6827485380116959
Test Accuracy: 0.6198830409356725
Test F1 Score: 0.25511432009626955

----------- Baseline Classification Report -----------

              precision    recall  f1-score   support

           0       0.62      1.00      0.77       106
           1       0.00      0.00      0.00        30
           2       0.00      0.00      0.00        35

    accuracy                           0.62       171
   macro avg       0.21      0.33      0.26       171
weighted avg       0.38      0.62      0.47       171



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## random forest

### simple random forest classifier

In [59]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [60]:
print(f'Training accuracy: {rfc.score(X_train, y_train)}')
print(f'Test accuracy: {rfc.score(X_test, y_test)}')
print(f'Test F1 Score: {f1_score(y_test, rfc.predict(X_test), average="macro")}')
print("\n----------- Baseline Classification Report -----------\n")
print(classification_report(y_test, rfc.predict(X_test)))

Training accuracy: 0.7763157894736842
Test accuracy: 0.6023391812865497
Test F1 Score: 0.34627892432770485

----------- Baseline Classification Report -----------

              precision    recall  f1-score   support

           0       0.63      0.92      0.75       106
           1       0.36      0.13      0.20        30
           2       0.33      0.06      0.10        35

    accuracy                           0.60       171
   macro avg       0.44      0.37      0.35       171
weighted avg       0.52      0.60      0.52       171



### weighting to handle imbalance

In [61]:
rfc_wt = RandomForestClassifier(class_weight='balanced')
rfc_wt.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=10, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [62]:
print(f'Training accuracy: {rfc_wt.score(X_train, y_train)}')
print(f'Test accuracy: {rfc_wt.score(X_test, y_test)}')
print(f'Test F1 Score: {f1_score(y_test, rfc_wt.predict(X_test), average="macro")}')
print("\n----------- Baseline Classification Report -----------\n")
print(classification_report(y_test, rfc_wt.predict(X_test)))

Training accuracy: 0.6681286549707602
Test accuracy: 0.4619883040935672
Test F1 Score: 0.32545003290275815

----------- Baseline Classification Report -----------

              precision    recall  f1-score   support

           0       0.59      0.64      0.61       106
           1       0.14      0.13      0.14        30
           2       0.26      0.20      0.23        35

    accuracy                           0.46       171
   macro avg       0.33      0.32      0.33       171
weighted avg       0.44      0.46      0.45       171



### without train-test split

In [63]:
rfc_wt_no_split = RandomForestClassifier(class_weight='balanced')
rfc_wt_no_split.fit(tweets_df_oh, tweets_df['airline_sentiment'])



RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=10, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [64]:
print(f'Training accuracy: {rfc_wt_no_split.score(tweets_df_oh, tweets_df["airline_sentiment"])}')
# print(f'Test accuracy: {rfc_wt.score(X_test, y_test)}')
print(f'Training F1 Score: {f1_score(tweets_df["airline_sentiment"], rfc_wt_no_split.predict(tweets_df_oh), average="macro")}')
print("\n----------- Baseline Classification Report -----------\n")
print(classification_report(tweets_df["airline_sentiment"], rfc_wt_no_split.predict(tweets_df_oh)))

Training accuracy: 0.656140350877193
Training F1 Score: 0.5932622121160803

----------- Baseline Classification Report -----------

              precision    recall  f1-score   support

           0       0.84      0.68      0.75       573
           1       0.44      0.64      0.52       132
           2       0.45      0.60      0.52       150

    accuracy                           0.66       855
   macro avg       0.57      0.64      0.59       855
weighted avg       0.71      0.66      0.67       855



### tuning hyperparams using grid-search

In [65]:
params = {'n_estimators':  [10, 50, 100, 200],
          'max_depth': [None, 5, 10],
          'max_features': ['auto','log2'],
          'class_weight': ['balanced', 'balanced_subsample']}
rfc_gs = GridSearchCV(RandomForestClassifier(), params, scoring='f1_macro', cv=5, verbose=1, n_jobs=-1)
rfc_gs.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   19.6s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [66]:
gs_df = pd.DataFrame(rfc_gs.cv_results_)
gs_df.sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_depth,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,0.113046,0.004468,0.004179,0.000177,balanced,,log2,10,"{'class_weight': 'balanced', 'max_depth': None...",0.31396,0.298517,0.404052,0.335384,0.361054,0.342381,0.037235,1
38,0.265924,0.010918,0.012471,0.000687,balanced_subsample,5.0,log2,100,"{'class_weight': 'balanced_subsample', 'max_de...",0.329995,0.355629,0.37147,0.350064,0.304571,0.342348,0.023035,2
26,0.470725,0.014271,0.017834,0.002083,balanced_subsample,,auto,100,"{'class_weight': 'balanced_subsample', 'max_de...",0.322154,0.355778,0.36285,0.347791,0.319963,0.341691,0.017531,3
27,0.937179,0.062143,0.037347,0.007733,balanced_subsample,,auto,200,"{'class_weight': 'balanced_subsample', 'max_de...",0.335564,0.341799,0.352519,0.345992,0.332626,0.341682,0.007135,4
44,0.113736,0.008189,0.003727,0.000436,balanced_subsample,10.0,log2,10,"{'class_weight': 'balanced_subsample', 'max_de...",0.326921,0.327112,0.348607,0.324604,0.375201,0.34041,0.019393,5


### best random forest classifier

In [67]:
rfc_best = rfc_gs.best_estimator_
rfc_best.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='log2',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=10, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [68]:
print(f'Training accuracy: {rfc_best.score(X_train, y_train)}')
print(f'Test accuracy: {rfc_best.score(X_test, y_test)}')
print(f'Test F1 Score: {f1_score(y_test, rfc_best.predict(X_test), average="macro")}')
print("\n----------- Baseline Classification Report -----------\n")
print(classification_report(y_test, rfc_best.predict(X_test)))

Training accuracy: 0.6885964912280702
Test accuracy: 0.5029239766081871
Test F1 Score: 0.38628950050968397

----------- Baseline Classification Report -----------

              precision    recall  f1-score   support

           0       0.62      0.66      0.64       106
           1       0.27      0.27      0.27        30
           2       0.28      0.23      0.25        35

    accuracy                           0.50       171
   macro avg       0.39      0.39      0.39       171
weighted avg       0.49      0.50      0.50       171



## Naive Bayes

In [69]:
nbc = MultinomialNB()
nbc.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [70]:
print(f'Training accuracy: {nbc.score(X_train, y_train)}')
print(f'Test accuracy: {nbc.score(X_test, y_test)}')
print(f'Test F1 Score: {f1_score(y_test, nbc.predict(X_test), average="macro")}')
print("\n----------- Baseline Classification Report -----------\n")
print(classification_report(y_test, nbc.predict(X_test)))

Training accuracy: 0.7017543859649122
Test accuracy: 0.6198830409356725
Test F1 Score: 0.2725634725634726

----------- Baseline Classification Report -----------

              precision    recall  f1-score   support

           0       0.62      0.99      0.76       106
           1       0.00      0.00      0.00        30
           2       0.50      0.03      0.05        35

    accuracy                           0.62       171
   macro avg       0.37      0.34      0.27       171
weighted avg       0.49      0.62      0.48       171



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## xgboost

### handling imbalance first

In [71]:
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [72]:
X_train_res.shape

(1401, 293)

In [73]:
X_train_res = pd.DataFrame(X_train_res, columns=X_test.columns)

In [74]:
# checking balance
unique, counts = np.unique(y_train_res, return_counts=True)
dict(zip(unique, counts))

{0: 467, 1: 467, 2: 467}

### simple xgboost

In [75]:
xgbc = XGBClassifier()
xgbc.fit(X_train_res, y_train_res)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [76]:
print(f'Training accuracy: {xgbc.score(X_train_res, y_train_res)}')
print(f'Test accuracy: {xgbc.score(X_test, y_test)}')
print(f'Test F1 Score: {f1_score(y_test, xgbc.predict(X_test), average="macro")}')
print("\n----------- Baseline Classification Report -----------\n")
print(classification_report(y_test, xgbc.predict(X_test)))

Training accuracy: 0.5139186295503212
Test accuracy: 0.3391812865497076
Test F1 Score: 0.2912656047905903

----------- Baseline Classification Report -----------

              precision    recall  f1-score   support

           0       0.78      0.29      0.42       106
           1       0.21      0.80      0.34        30
           2       0.16      0.09      0.11        35

    accuracy                           0.34       171
   macro avg       0.38      0.39      0.29       171
weighted avg       0.55      0.34      0.35       171



### grid search

In [77]:
params = {'min_child_weight': [1, 5, 10],
          'gamma': [0.5, 1, 1.5, 2, 5],
          'subsample': [0.6, 0.8, 1.0],
          'colsample_bytree': [0.6, 0.8, 1.0],
          'max_depth': [3, 4, 5]}
xgbc_gs = GridSearchCV(XGBClassifier(), params, scoring='f1_macro', cv=5, verbose=1, n_jobs=-1)
xgbc_gs.fit(X_train_res, y_train_res)

Fitting 5 folds for each of 405 candidates, totalling 2025 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 20.9min
[Parallel(n_jobs=-1)]: Done 2025 out of 2025 | elapsed: 24.5min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=-1,
             param_grid={'colsample_bytree': [0.6, 0.8, 1.0],
                         'gamma': [0.5, 1, 1.5, 2, 5], 'max_depth': [3, 4, 5],
                         'mi

In [78]:
xgbc_best = xgbc_gs.best_estimator_
xgbc_best.fit(X_train_res, y_train_res)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=0.5,
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.6, verbosity=1)

In [79]:
print(f'Training accuracy: {xgbc_best.score(X_train_res, y_train_res)}')
print(f'Test accuracy: {xgbc_best.score(X_test, y_test)}')
print(f'Test F1 Score: {f1_score(y_test, xgbc_best.predict(X_test), average="macro")}')
print("\n----------- Baseline Classification Report -----------\n")
print(classification_report(y_test, xgbc_best.predict(X_test)))

Training accuracy: 0.5267665952890792
Test accuracy: 0.3391812865497076
Test F1 Score: 0.2801330070895288

----------- Baseline Classification Report -----------

              precision    recall  f1-score   support

           0       0.71      0.32      0.44       106
           1       0.20      0.73      0.32        30
           2       0.13      0.06      0.08        35

    accuracy                           0.34       171
   macro avg       0.35      0.37      0.28       171
weighted avg       0.50      0.34      0.35       171



Conclusion: The best model is the random forest classifier.

## saving model

In [80]:
joblib.dump(rfc_best, 'rfc_best.pkl')

['rfc_best.pkl']

## loading model

In [81]:
rfc_pkl = joblib.load('rfc_best.pkl')

In [82]:
# checking if the loaded model is the same as the original
assert rfc_pkl.score(X_train, y_train) == rfc_best.score(X_train, y_train)

# Scoring cities

In [83]:
sentiment_score = pd.DataFrame(rfc_pkl.predict_proba(tweets_df_oh), columns=['negative','neutral','positive'])

In [84]:
sentiment_score['city'] = tweets_df['city_euclidean_3d_knn'].values

In [85]:
sentiment_score = sentiment_score[['city','negative','neutral','positive']]

In [86]:
city_rank = sentiment_score.sort_values('positive', ascending=False).drop_duplicates()

In [87]:
city_rank = city_rank.reset_index(drop=True)

## cities ranked

In [88]:
city_rank.head()

Unnamed: 0,city,negative,neutral,positive
0,Meadow Woods,0.0,0.0,1.0
1,Rio Linda,0.1,0.0,0.9
2,Euless,0.1,0.0,0.9
3,Polanco,0.117926,0.082074,0.8
4,Funza,0.2,0.0,0.8


## output to file

In [89]:
city_rank.to_csv('city_rank.csv')