### This is a sample of how to do some of the basic things that you would need to do for this task

#### Problem Statement: Create a model that can help you determine features and characteristics of a person based only on their handwriting

In [15]:
# Import the basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [16]:
# Source spreadsheet: https://docs.google.com/spreadsheets/d/1hD128Qekc2zPABdQHk-N-uE5Mi8v7yxK3YCVigvt8FY/edit?usp=sharing
# A CSV export of that sheet is read from the current working directory.
# header=1 makes pandas take the column names from the second line of the file.
df = pd.read_csv(
    "./Handwriting Personality Dataset Demo - Sheet1.csv",
    header=1,
)
df.head()

FileNotFoundError: File b'./Handwriting Personality Dataset Demo - Sheet1.csv' does not exist

In [None]:
# As always, we need to visualize our data and look at what it's made up of.
# Here we count how many people write with a right slant, a left slant or no slant at all
# and draw one bar per slant value.
# You will need to do more than just this when investigating your data.
# Series.value_counts() is used because the top-level pd.value_counts() is deprecated.
df['Slant'].value_counts().plot.bar()

#### Our data seems to exist in textual form. We need it to be in numerical form that the computer can understand

This is how we want to encode our data <br />
loops -> one hot encode <br />
slant -> one hot encode <br />
space between words -> label encode <br />
space between letters -> label encode <br />
letter size -> label encode <br />
personality type -> one hot encode <br />

In [None]:
# let us turn slant into one hot encoded

# first we import the packages we need (we will need this package to do the encoding)
from sklearn import preprocessing

In [None]:
# First we encode the slant using the "get_dummies" method of pandas.
# Let's look at what that does: it returns one indicator column per distinct
# Slant value (named with the "slant_" prefix) and leaves df itself untouched.
# The Loops column must stay in df here, because a later cell one-hot encodes it.
pd.get_dummies(df['Slant'], prefix='slant')

In [None]:
# The dummy columns are exactly what we want, so append them to the
# dataframe (df) and discard the original text-valued Slant column.
df = pd.concat(
    [df, pd.get_dummies(df["Slant"], prefix="slant")],
    axis=1,
).drop(columns=["Slant"])

df.head()

In [None]:
# Now that looks AMAZING!!!

In [None]:
# Same treatment for Loops: append its dummy columns (prefixed "loop_")
# and discard the original text-valued Loops column.
df = pd.concat(
    [df, pd.get_dummies(df["Loops"], prefix="loop")],
    axis=1,
).drop(columns=["Loops"])

df.head()

In [None]:
# Now that that is done, we need to start dealing with the other data that can be ordered

In [None]:
# Relevant link: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# Create the encoder and fit it on the column in a single step
# (fit() returns the fitted encoder itself).
label_encoder = preprocessing.LabelEncoder().fit(df["Space between words"])

# Inspect the classes the encoder has learnt
label_encoder.classes_

In [None]:
# Try the fitted encoder on a few sample labels; each label comes back
# as the integer code the encoder assigned to it.
label_encoder.transform(
    ["small", "large", "medium"]
)

In [None]:
# What this means is that, we can now use label encoder to transform a piece of text into the 
# numerical form

In [None]:
# Replace the text labels in the column with the codes produced by the
# encoder fitted on this same column.
df["Space between words"] = df["Space between words"].pipe(label_encoder.transform)

In [None]:
# Sanity check: show the first five rows after encoding
df.head(n=5)

In [None]:
# Now let's do all of that for the "Space between letters" and "Letter size" columns.
# Each column gets its own encoder, fitted on that column's own labels and then
# used to encode that same column. Reusing the encoder that was fitted on
# "Space between words" would apply the wrong label-to-code mapping (or fail on
# labels it has never seen).

label_encoder_2 = preprocessing.LabelEncoder()
# Fit on the column and encode it with the same encoder
df["Space between letters"] = label_encoder_2.fit_transform(df["Space between letters"])

label_encoder_3 = preprocessing.LabelEncoder()
# Fit on the column and encode it with the same encoder
df["Letter size"] = label_encoder_3.fit_transform(df["Letter size"])

In [None]:
# Show the first five rows now that the ordered columns are encoded
df.head(n=5)

In [None]:
# So far so good ... now let's figure out what we want to do with the personality type.
# Since there are only 2 personality types in my example, we can choose to say that
# introvert = 0 and extrovert = 1
# What this means is that we only need one column checking if they are extroverts or not.

# Let's add that column: 1 where "Personality type" is exactly "extrovert", 0 otherwise.
# The comparison is vectorized over the whole column instead of a row-by-row apply.
df["extrovert"] = (df["Personality type"] == "extrovert").astype(int)

In [None]:
# Show the first five rows including the new extrovert column
df.head(n=5)

In [None]:
# We don't need the "Personality type" column and can remove it if we want. I will just keep it for now :)

In [None]:
# Now we have the data that we want, and it's clean(ish)
# So let's jump straight into building our models - In the real world
# we would want to spend more time with the data and ask better questions to investigate it

from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Model input (X): the one-hot slant and loop indicators plus the two
# label-encoded spacing columns. Model output (y): the extrovert flag.
X = df[
    [
        "slant_left",
        "slant_none",
        "slant_right",
        "loop_lower",
        "loop_upper",
        "Space between words",
        "Space between letters",
    ]
]
y = df["extrovert"]

In [None]:
# Create the classifier (n_neighbors=5 is scikit-learn's default, spelled out here)
model = KNeighborsClassifier(n_neighbors=5)

# Train it on the inputs (X) and labels (y) defined above
model.fit(X, y)

In [None]:
# NOTE: our model takes inputs in the form ["slant_left", "slant_none", "slant_right", "loop_lower", "loop_upper", "Space between words", "Space between letters"]

# predict with the model
# The sample row is wrapped in a DataFrame that reuses the training column names,
# so the input has the same layout the model was fitted on. Passing a bare list to
# a model fitted on a DataFrame makes recent scikit-learn versions warn about
# missing feature names.
model.predict(pd.DataFrame([[0, 0, 1, 1, 0, 2, 3]], columns=X.columns))

In [None]:
# what the above line is predicting is that the person is an introvert given that:
# slant_left = 0
# slant_none = 0
# slant_right = 1
# loop_lower = 1
# loop_upper = 0
# space between words is small
# space between letters is medium

# in summary: The person's handwriting has lower loops, slants to the right, and uses a small amount of space
# between words and a medium amount of space between the letters

### NOTE: Several steps have been skipped in order to get to experimenting with encoding data! Keep in mind this is a rough draft to explain the issues we had in class.

In [None]:
# Relevant links:
# http://queirozf.com/entries/one-hot-encoding-a-feature-on-a-pandas-dataframe-an-example
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html