In [18]:
# libraries for general use
import numpy as np
import pandas as pd

In [19]:
# reading our dataset in a pd.DataFrame object
df = pd.read_csv('originOfMusic.csv')
df.rename(columns={'V117': 'lat', 'V118': 'long'}, inplace=True)
# structure of the dataset:
# print(df.head(2))

In [20]:
# we will also need countries names and codes for visualization
excel = pd.read_excel('countries.xlsx',index_col=1, header=None)
# print(excel.head())

In [21]:
# counting how many track per country are
tracks_per_country = df.groupby('long').describe()['V1']
tracks_per_country = tracks_per_country['count']

new_columns = pd.DataFrame(tracks_per_country.sort_values(ascending=False))

new_columns['country_name'] = [excel.iloc[i][0] for i in range(0, len(excel))]
new_columns['country_label'] = np.arange(0, 33)
new_columns['country_code'] = [excel.iloc[i][2] for i in range(0, len(excel))]

print(new_columns.head(2))

       count country_name  country_label country_code
long                                                 
 77.2   69.0        India              0          IND
-8.0    66.0         Mali              1          MLI


In [22]:
# adding to the dataframe 2 new columns:
# labels are used as target for our model
# split_l is used for cross-validation (StratifiedSuffleSplit and StratifiedKFold need a criterion for preserving
# the percentage of sample for each class)
df['labels'] = [tuple([lat, long]) for lat, long in zip(df['lat'], df['long'])]
df['split_l'] = [new_columns.loc[long]['country_label'] for long in df['long'].values]

print(df.head(2))

         V1        V2        V3        V4        V5        V6        V7  \
0  7.161286  7.835325  2.911583  0.984049 -1.499546 -2.094097  0.576000   
1  0.225763 -0.094169 -0.603646  0.497745  0.874036  0.290280 -0.077659   

         V8        V9       V10   ...         V111      V112      V113  \
0 -1.205671  1.849122 -0.425598   ...    -0.364194 -0.364194 -0.364194   
1 -0.887385  0.432062 -0.093963   ...     0.936616  0.936616  0.936616   

       V114      V115      V116    lat   long            labels  split_l  
0 -0.364194 -0.364194 -0.364194 -15.75 -47.95  (-15.75, -47.95)        7  
1  0.936616  0.936616  0.936616  14.91 -23.51   (14.91, -23.51)       10  

[2 rows x 120 columns]


In [23]:
# a priori visualization
import plotly.graph_objects as go

fig = go.Figure(data=go.Choropleth(
    locations = new_columns['country_code'],
    z = new_columns['count'],
    text = new_columns['country_name'],
    colorscale = 'Blues',
    autocolorscale=False,
    reversescale=False,
    marker_line_color='darkgray',
    marker_line_width=0.5,
))

fig.show()

In [24]:
# the loss function
from geopy.distance import geodesic
from shapely.geometry import Point

from sklearn.metrics.scorer import make_scorer

def score_function(true_labels, prediction):
    s = []
    for pred, true in zip(prediction, true_labels):
        s.append(geodesic(pred, true))
    return float(str(np.mean(s))[:-3])

score_func  = make_scorer(score_function, greater_is_better=False)

In [25]:
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn import preprocessing

In [26]:
# types of cross-validation 

strKfold = model_selection.StratifiedKFold(n_splits=10, random_state=0)
strShuSplit = model_selection.StratifiedShuffleSplit(n_splits=10, test_size=.20, random_state=0)
kFold = model_selection.KFold(n_splits=10)

In [27]:
normed_data = preprocessing.normalize(df.iloc[:, :-4])
# print (preprocessing.robust_scale(df.iloc[:, :-4]))

In [28]:
# ElasticNet and LassoLars Regression
elNet = linear_model.ElasticNet(max_iter=10000, warm_start=True)


In [29]:
# parameters for GridSearch
paramEN = {'alpha': [0.1, 1, 0.01],
           'l1_ratio': [0.3,  1],
           'tol': [0.01, 0.001, 0.0001, 0.00001]}


In [30]:
# searching for best parameter in the ElasticNet

en = GridSearchCV(elNet, paramEN, cv=strKfold, scoring=score_func)
en.fit(normed_data, list(df.iloc[:]['split_l']))


The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.



GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
             error_score='raise-deprecating',
             estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True,
                                  l1_ratio=0.5, max_iter=10000, normalize=False,
                                  positive=False, precompute=False,
                                  random_state=None, selection='cyclic',
                                  tol=0.0001, warm_start=True),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [0.1, 1, 0.01], 'l1_ratio': [0.3, 1],
                         'tol': [0.01, 0.001, 0.0001, 1e-05]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(score_function, greater_is_better=False),
             verbose=0)

In [31]:
# print(df.iloc[:, :-4])
# normed_data = preprocessing.normalize(df.iloc[:, :-4])
# print (preprocessing.robust_scale(df.iloc[:, :-4]))
# print(df.iloc[:, :-4])

In [32]:
print(en.best_params_)
# print(ll.best_params_)

{'alpha': 0.01, 'l1_ratio': 1, 'tol': 0.001}


In [33]:

test = linear_model.ElasticNet(alpha = en.best_params_['alpha'], tol=en.best_params_['tol'], 
                               l1_ratio=en.best_params_['l1_ratio'], normalize=True, max_iter=10000, 
                               warm_start=False)

s = []
for train_index, test_index in strKfold.split(df.iloc[:, :-4], df[:]['split_l']):
    s = []
    X_train, y_train = df.iloc[train_index, :-4], list(df.iloc[train_index]['labels'])
    
    test.fit(X_train, y_train)
    pred = pd.DataFrame(test.predict(df.iloc[test_index, :-4]))

    aux = score_function(prediction=pred, true_labels=df.iloc[test_index]['labels'])
    s.append(aux)
    print(aux)
print('mean ', np.mean(s))


4277.813568277068
5505.420391217491
5505.420391217491
5505.420391217491
5505.420391217491
9865.023090461644
9865.023090461644
9874.02750813448
7133.702763406881
5971.555347249576
mean  5971.555347249576


In [34]:
# prediction on the map
fig = go.Figure(
        data=go.Scattergeo(
        lon = pred[1],
        lat = pred[0],
        mode = 'markers',
        marker_color = 'Blue',
        ))

fig.show()