Mean Shift

madhug-nadig · Jun 1, 2017 · 162f64c · 162f64c
1 parent 4086f51
commit 162f64c
Showing 1 changed file with 60 additions and 2 deletions.
diff --git a/Mean Shift.py b/Mean Shift.py
@@ -10,6 +10,7 @@
 from matplotlib import style
 import pandas
 import datetime
+from sklearn import preprocessing, cross_validation
 
 #for plotting
 plt.style.use('ggplot')
@@ -21,12 +22,69 @@ def __init__(self):
 
 
 def main():
+	'''
+	Pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
+	survived Survival (0 = No; 1 = Yes)
+	name Name
+	sex Sex
+	age Age
+	sibsp Number of Siblings/Spouses Aboard
+	parch Number of Parents/Children Aboard
+	ticket Ticket Number
+	fare Passenger Fare (British pound)
+	cabin Cabin
+	embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
+	boat Lifeboat
+	body Body Identification Number
+	home.dest Home/Destination
+	'''
 
-	dataset = { -1 : np.array([[2,3],[4,5],[2,1]]), 1: np.array([[5,6], [8,8], [9,9]]) }
+	df = pd.read_excel('data/titanic.xls')
+
+	original_df = pd.DataFrame.copy(df)
+	df.drop(['body','name'], 1, inplace=True)
+	df.fillna(0,inplace=True)
+
+	def handle_non_numerical_data(df):
+
+		# handling non-numerical data: must convert.
+		columns = df.columns.values
+
+		for column in columns:
+			text_digit_vals = {}
+			def convert_to_int(val):
+				return text_digit_vals[val]
+
+			#print(column,df[column].dtype)
+			if df[column].dtype != np.int64 and df[column].dtype != np.float64:
+
+				column_contents = df[column].values.tolist()
+				#finding just the uniques
+				unique_elements = set(column_contents)
+				# great, found them. 
+				x = 0
+				for unique in unique_elements:
+					if unique not in text_digit_vals:
+						# creating dict that contains new
+						# id per unique string
+						text_digit_vals[unique] = x
+						x+=1
+				# now we map the new "id" value
+				# to replace the string. 
+				df[column] = list(map(convert_to_int,df[column]))
+
+		return df
+
+	df = handle_non_numerical_data(df)
+	df.drop(['ticket','home.dest'], 1, inplace=True)
+
+	X = np.array(df.drop(['survived'], 1).astype(float))
+	X = preprocessing.scale(X)
+	y = np.array(df['survived'])
 	ms = CustomMS()
 
 	ms.fit(dataset = dataset)
-	pred = ms.predict(attrs = [2,2])
+
 
 if __name__ == "__main__":
 	main()