In [1]:
from pandas import Series, DataFrame
import pandas as pd
from patsy import dmatrices
%pylab inline

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [2]:
#Reading the dataset
df=pd.read_csv('C:\\Users\\parth\\OneDrive\\Documents\\UT Austin\\DS Programming\\Project\\NASA\\dataset\\neo_v2.csv')
df.head()

Unnamed: 0,id,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
0,2162635,162635 (2000 SS164),1.198271,2.679415,13569.249224,54839740.0,Earth,False,16.73,False
1,2277475,277475 (2005 WK4),0.2658,0.594347,73588.726663,61438130.0,Earth,False,20.0,True
2,2512244,512244 (2015 YE18),0.72203,1.614507,114258.692129,49798720.0,Earth,False,17.83,False
3,3596030,(2012 BV13),0.096506,0.215794,24764.303138,25434970.0,Earth,False,22.2,False
4,3667127,(2014 GE35),0.255009,0.570217,42737.733765,46275570.0,Earth,False,20.09,True


In [3]:
No_of_true=len(df[df['hazardous']==True])
No_of_false=len(df[df['hazardous']==False])

print('Baseline Accuracy is',(No_of_false)/(No_of_true+No_of_false))

Baseline Accuracy is 0.90268175613193


In [4]:
#First we check for null values
df.isnull().sum()

id                    0
name                  0
est_diameter_min      0
est_diameter_max      0
relative_velocity     0
miss_distance         0
orbiting_body         0
sentry_object         0
absolute_magnitude    0
hazardous             0
dtype: int64

In [5]:
#So there are no missing values
#Now we convert the target variable to 0,1
def convert_to_int(X):
    return int(X)
df['hazardous']=df['hazardous'].map(convert_to_int)
#Now we check the columns orbiting_body and sentry_object
print(df['orbiting_body'].value_counts())
print(df['sentry_object'].value_counts())

Earth    90836
Name: orbiting_body, dtype: int64
False    90836
Name: sentry_object, dtype: int64


In [6]:
#These two columns clearly have only one value throughout. So we can drop them.
df=df.drop(['sentry_object','orbiting_body'],axis=1)
#Clearly, we can drop the first two columns as well (id and name of asteroid) as they are not useful for
#making predictions
df=df.drop(['id','name'],axis=1)
df.head()

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,1.198271,2.679415,13569.249224,54839740.0,16.73,0
1,0.2658,0.594347,73588.726663,61438130.0,20.0,1
2,0.72203,1.614507,114258.692129,49798720.0,17.83,0
3,0.096506,0.215794,24764.303138,25434970.0,22.2,0
4,0.255009,0.570217,42737.733765,46275570.0,20.09,1


In [7]:
#Now we can fit a tree to this dataset.
Y, X = dmatrices('hazardous ~ 0 + est_diameter_min + est_diameter_max + relative_velocity + miss_distance + absolute_magnitude', df, return_type='dataframe')
y = Y['hazardous'].values

In [8]:
#Creating Training and Test Sets (75-25)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=1)

In [9]:
#fitting logistic regression model to training data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
result = model.fit(X_train, y_train)



In [10]:
#Finding Training Accuracy
from sklearn import metrics

prediction_train = model.predict(X_train)
print(metrics.accuracy_score(y_train, prediction_train))



0.9027111130682401


In [11]:
#Training accuracy is 0.9027 which is almost the same as baseline accuracy

#Now, Accuracy on test Set is
prediction = model.predict(X_test)
print(metrics.accuracy_score(y_test, prediction))



0.9025936853229997


In [12]:
# y_train is 0 or 1.
print('Number of positive examples =', len(y_train[y_train==1]))
print('Number of negative examples =', len(y_train[y_train==0]))

negative_examples_in_test = len(y_test[y_test==0])
total_examples_in_test = len(y_test)

print('Number of examples where baseline is correct =', negative_examples_in_test)
print('Baseline accuracy =', negative_examples_in_test * 1.0 / total_examples_in_test)

Number of positive examples = 6628
Number of negative examples = 61499
Number of examples where baseline is correct = 20497
Baseline accuracy = 0.9025936853229997


In [13]:
#Weight of each feature
model.coef_



array([[ 9.43195645e-11,  2.10904958e-10, -1.63593696e-05,
        -2.97688934e-08, -2.25370781e-08]])

In [14]:
model.intercept_

array([-7.97372139e-10])

In [15]:
weights = Series(model.coef_[0],
                 index=X.columns.values)
weights.sort_values()



relative_velocity    -1.635937e-05
miss_distance        -2.976889e-08
absolute_magnitude   -2.253708e-08
est_diameter_min      9.431956e-11
est_diameter_max      2.109050e-10
dtype: float64