In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
from sklearn.cluster import KMeans
import seaborn as sns
from sklearn.model_selection import train_test_split
import re

data = pd.read_csv("Star39552_balanced.csv", sep=",")
data.head()

Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType,Amag,TargetClass
0,10.0,31.66,6.19,1.213,K7V,22.502556,1
1,8.26,3.21,1.0,1.13,K0III,15.792525,0
2,8.27,12.75,1.06,0.596,F9V,18.797552,1
3,6.54,5.23,0.76,1.189,K1III,15.132508,0
4,8.52,0.96,0.72,0.173,B8V,13.431356,1


In [2]:
print(data.dtypes)

Vmag           float64
Plx            float64
e_Plx          float64
B-V            float64
SpType          object
Amag           float64
TargetClass      int64
dtype: object


In [3]:
replace= {'K([0-9]([a-z]|[A-Z])*)':'K','F([0-9]([a-z]|[A-Z])*)':'F','B([0-9]([a-z]|[A-Z])*).':'B','O([0-9]([a-z]|[A-Z])*)':'O','A([0-9]([a-z]|[A-Z])*)':'A','G([0-9]([a-z]|[A-Z])*)':'G','M([0-9]([a-z]|[A-Z])*)':'M'}
data_new= data.replace({"SpType": replace}, regex=True)

In [4]:
data_new.head()

Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType,Amag,TargetClass
0,10.0,31.66,6.19,1.213,K,22.502556,1
1,8.26,3.21,1.0,1.13,K,15.792525,0
2,8.27,12.75,1.06,0.596,F,18.797552,1
3,6.54,5.23,0.76,1.189,K,15.132508,0
4,8.52,0.96,0.72,0.173,B,13.431356,1


In [5]:
NType = {'K': 0, 'F': 1, 'B': 2, 'O': 3, 'A': 4, 'G': 5, 'M': 6}
data_new['NumSpType'] = data_new['SpType'].map(NType)
data_new.head()

Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType,Amag,TargetClass,NumSpType
0,10.0,31.66,6.19,1.213,K,22.502556,1,0.0
1,8.26,3.21,1.0,1.13,K,15.792525,0,0.0
2,8.27,12.75,1.06,0.596,F,18.797552,1,1.0
3,6.54,5.23,0.76,1.189,K,15.132508,0,0.0
4,8.52,0.96,0.72,0.173,B,13.431356,1,2.0


In [6]:
data_new["NumSpType"].value_counts()

0.0    9550
1.0    5605
5.0    5560
4.0    3477
2.0    3381
6.0    1330
3.0      35
Name: NumSpType, dtype: int64

In [15]:
pd.options.display.max_rows = 20000
data_new

Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType,Amag,TargetClass,NumSpType
0,10.00,31.66,6.19,1.213,K,22.502556,1,0.0
1,8.26,3.21,1.00,1.130,K,15.792525,0,0.0
2,8.27,12.75,1.06,0.596,F,18.797552,1,1.0
3,6.54,5.23,0.76,1.189,K,15.132508,0,0.0
4,8.52,0.96,0.72,0.173,B,13.431356,1,2.0
...,...,...,...,...,...,...,...,...
39547,5.83,0.17,0.52,0.474,B,6.982245,0,2.0
39548,7.05,18.12,0.92,0.424,F,18.340790,1,1.0
39549,9.21,3.89,1.46,0.227,A,17.159748,1,4.0
39550,9.01,2.13,1.46,1.467,M,15.651898,0,6.0


In [8]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()

In [9]:
X = data_new[['Vmag','Plx','B-V','Amag']]
y = data_new.NumSpType

In [10]:
X

Unnamed: 0,Vmag,Plx,B-V,Amag
0,10.00,31.66,1.213,22.502556
1,8.26,3.21,1.130,15.792525
2,8.27,12.75,0.596,18.797552
3,6.54,5.23,1.189,15.132508
4,8.52,0.96,0.173,13.431356
...,...,...,...,...
39547,5.83,0.17,0.474,6.982245
39548,7.05,18.12,0.424,18.340790
39549,9.21,3.89,0.227,17.159748
39550,9.01,2.13,1.467,15.651898


In [11]:
y

0        0.0
1        0.0
2        1.0
3        0.0
4        2.0
        ... 
39547    2.0
39548    1.0
39549    4.0
39550    6.0
39551    1.0
Name: NumSpType, Length: 39552, dtype: float64

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
print(clf.fit(X_train, y_train))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
clf.feature_importances_

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
select = SelectKBest(f_classif, k=2)
select.fit(X, y)

In [None]:
mask = select.get_support()
print(mask)

In [None]:
new_X = data_new[['B-V','Amag']]
new_Y = data_new.SpType
df_new = pd.DataFrame(new_X)
df_new.head()

In [None]:
kmeans = KMeans(n_clusters=3, random_state=10).fit(df_new)
clusters = kmeans.cluster_centers_
clusters

In [None]:
plt.rc('grid', linestyle="-", color='black')
plt.scatter(df_new.iloc[:, 0], df_new.iloc[:, 1], c=new_Y, cmap=plt.cm.Set1, edgecolor='k')
plt.xlabel('B-V')
plt.ylabel('Amag')

plt.plot(clusters[0][0],clusters[0][1],'bo',markersize=12) 
plt.plot(clusters[1][0],clusters[1][1],'bo',markersize=12) 
plt.plot(clusters[2][0],clusters[2][1],'bo',markersize=12)

 


plt.xlabel('B-V')
plt.ylabel('Amag')
plt.grid(True)
plt.show()