In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.impute import SimpleImputer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [28]:
# Load the scrubbed UFO sightings CSV and preview the first rows.
# Other quick checks while exploring: ufos.keys(), ufos.dtypes
csv_path = '../input/ufo-sightings/scrubbed.csv'
ufos = pd.read_csv(csv_path)
ufos.head()

In [38]:
# Clean the duration column: remove stray backticks, then convert to float.
# The column is cast to str first because the `.str` accessor yields NaN for
# any element that is not a string; in a mixed-type object column that would
# silently discard the numeric entries. The cast also makes this cell safe to
# re-run after the column has already been converted to float.
duration_text = ufos['duration (seconds)'].astype(str).str.strip('`')
ufos['duration (seconds)'] = duration_text.astype(float)

In [39]:
# Brute-force typecasting: patch the one malformed latitude string, then
# convert the whole column to float.
# A single .loc assignment is used instead of chained indexing
# (ufos['latitude'][mask] = ...), which may write to a temporary copy and
# raises SettingWithCopyWarning.
ufos.loc[ufos['latitude'] == '33q.200088', 'latitude'] = '33.200088'
ufos['latitude'] = ufos['latitude'].astype(float)

In [40]:
# Fill np.nan entries in every column with that column's most frequent value.
# The imputed values are stored in `ufos_temp`; `ufos` is not reassigned here.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imputer.fit(ufos)
ufos_temp = imputer.transform(ufos)
# Summary of the original frame (column dtypes and non-null counts).
ufos.info()

In [41]:
# Wrap the imputed values in a DataFrame again, restoring the column labels.
# Note: the last label, 'longitude ', carries a trailing space.
titles = [
    'datetime',
    'city',
    'state',
    'country',
    'shape',
    'duration (seconds)',
    'duration (hours/min)',
    'comments',
    'date posted',
    'latitude',
    'longitude ',
]
uf = pd.DataFrame(data=ufos_temp, columns=titles)
uf.head()

In [43]:
# Build a smaller frame holding only the columns of interest, under shorter
# names. Keys are the new column names; values are the source columns in `uf`.
column_map = {
    'Seconds': 'duration (seconds)',
    'Country': 'country',
    'Latitude': 'latitude',
    'Longitude': 'longitude ',
    'Shape': 'shape',
}
ufos_true = pd.DataFrame({new_name: uf[source] for new_name, source in column_map.items()})


In [44]:
# List the distinct values present in the Country column.
ufos_true['Country'].unique()

In [46]:
# Belt and braces: remove, in place, any row that still has a missing value.
ufos_true.dropna(axis=0, how='any', inplace=True)

In [47]:
# Show ufos_true as the cell output, to inspect it after the dropna cell.
ufos_true

In [48]:
# Encode the categorical Country and Shape columns as integer codes.
from sklearn.preprocessing import LabelEncoder

# Keep the fitted encoders instead of discarding them, so the integer codes
# can be mapped back to the original labels later (via `classes_` or
# `inverse_transform`), e.g. when interpreting model predictions.
country_encoder = LabelEncoder()
shape_encoder = LabelEncoder()
ufos_true['Country'] = country_encoder.fit_transform(ufos_true['Country'])
ufos_true['Shape'] = shape_encoder.fit_transform(ufos_true['Shape'])

In [49]:
# Shape is suspected to be mislabelled. Show the rows whose encoded Shape is
# not 0; the filtered frame is only displayed, not assigned, so ufos_true
# itself is left unchanged by this cell.
ufos_true.loc[ufos_true['Shape'] != 0]
# Disabled alternative: drop the Shape column entirely.
#ufos_true.drop(['Shape'], axis=1,inplace=True)

In [51]:
# Summary statistics for ufos_true. The method must be called: without the
# parentheses the cell only shows the bound-method object, not the statistics.
ufos_true.describe()


In [57]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Choose the predictor columns and the target, then split into train/test.
features = ['Seconds', 'Latitude', 'Longitude']

y = ufos_true['Country']
X = ufos_true[features]

# Hold out 33% of the rows for testing; random_state is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.33,
)

In [61]:
# Two-step pipeline: standardise the features, then fit a logistic regression
# (max_iter raised to 500). Fit on the training split, predict on the test split.
steps = [
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=500)),
]
pipe = Pipeline(steps)
pipe.fit(X_train, y_train)
predictions = pipe.predict(X_test)

In [62]:
# Evaluate the test-set predictions: per-class report, raw predicted labels,
# and overall accuracy.
report = classification_report(y_test, predictions)
accuracy = accuracy_score(y_test, predictions)

print(report)
print('Predicted labels: ', predictions)
print('Accuracy: ', accuracy)

In [65]:
import pickle

model_filename = 'ufo-model.pkl'

# Persist the fitted pipeline (scaler + classifier). The original dumped the
# name `model`, which no cell in this notebook defines before this point, so a
# fresh Restart & Run All would fail with NameError. `with` blocks ensure the
# file handles are closed.
with open(model_filename, 'wb') as model_file:
    pickle.dump(pipe, model_file)

# Reload the pipeline to check the round trip.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(model_filename, 'rb') as model_file:
    model = pickle.load(model_file)

# Predict for one sample. A DataFrame with the training column names is used so
# the input matches the feature names the pipeline was fitted with
# (order: Seconds, Latitude, Longitude).
sample = pd.DataFrame([[50, 44, -12]], columns=features)
print(model.predict(sample))