In [1]:
# Import dependencies
# numpy for matrix algbera
import numpy as np
# Pandas for data manipulation
import pandas as pd
# matplotlib for data visualization
import matplotlib.pyplot as plt

# Set seed so we get same random allocation on each run of code
np.random.seed(2017)

# Load the IMDB data
educ_data = pd.read_csv("Grade1Students.csv")

# Show structure of data set
educ_data.head()

Unnamed: 0,g1freelunch,g1absent,g1readscore,g1mathscore,g1listeningscore,g1wordscore
0,1,9,516,578,601,493
1,0,12,451,507,584,436
2,1,4,483,526,529,486
3,1,15,516,505,556,536
4,1,2,433,463,504,426


In [2]:
# Summary statistics
educ_data.describe()

Unnamed: 0,g1freelunch,g1absent,g1readscore,g1mathscore,g1listeningscore,g1wordscore
count,5550.0,5550.0,5550.0,5550.0,5550.0,5550.0
mean,0.500901,7.421261,521.307207,531.456216,567.824324,514.643063
std,0.500044,7.004582,55.278448,43.151113,33.562973,52.858396
min,0.0,0.0,404.0,404.0,477.0,317.0
25%,0.0,2.0,478.0,502.0,543.0,475.0
50%,1.0,6.0,516.0,529.0,565.0,514.0
75%,1.0,10.0,558.0,562.0,588.0,551.0
max,1.0,84.0,651.0,676.0,708.0,601.0


In [3]:
# We will normalize each feature to have mean 0 and standard deviation 1
# This standardization is done to represent input data on the same scale

# Standardize free lunch status
educ_data["g1freelunch"] = (educ_data["g1freelunch"]-np.mean(educ_data["g1freelunch"]))/np.std(educ_data["g1freelunch"])

# Standardize absences 
educ_data["g1absent"] = (educ_data["g1absent"]-np.mean(educ_data["g1absent"]))/np.std(educ_data["g1absent"])

# Standardize reading score
educ_data["g1readscore"] = (educ_data["g1readscore"]-np.mean(educ_data["g1readscore"]))/np.std(educ_data["g1readscore"])

# Standardize math score
educ_data["g1mathscore"] = (educ_data["g1mathscore"]-np.mean(educ_data["g1mathscore"]))/np.std(educ_data["g1mathscore"])

# Standardize listening score
educ_data["g1listeningscore"] = (educ_data["g1listeningscore"]-np.mean(educ_data["g1listeningscore"]))/np.std(educ_data["g1listeningscore"])

# Standardized word study score
educ_data["g1wordscore"] = (educ_data["g1wordscore"]-np.mean(educ_data["g1wordscore"]))/np.std(educ_data["g1wordscore"])

In [4]:
# Initialize total number of itterations (remember n = 5550)
total_itter = 3*len(educ_data.index)

# Initialize number of output nodes
nodes_num = 3

# Dimension of input data
input_dim = len(educ_data.columns)

# Initialize parameters for learning rate 
learn_init = 0.1

# Step 1: Initialize the weight vectors 
# Randomly generated matrix with entries between [-2,2], each column is a weight vector 
Weight_mat = 4*np.random.rand(input_dim,nodes_num)-2

# Show initialized weight matrix
print "Initialized weight matrix,", Weight_mat

# Start SOM algorithm itterations
for itter in range(total_itter):
    
    # Initialize distance from weight to chosen point (will be updated in inner loop)
    dist_bmu = float("inf")
    
    # Step 2: Choose data point at random from input data
    
    # Select row index at random
    row_index = np.random.randint(len(educ_data.index))
    
    # Get corresponding data vector
    data_chosen = educ_data.loc[[row_index]]
    
    # Step 3: Find the weight vector that is closest to chosen point
    for node in range(nodes_num):
        
        # Compute euclidean distance from weight vector to chosen point
        dist_neuron = np.linalg.norm(data_chosen-Weight_mat[:,node])
        
        # Save the node with shortest distance of its neuron to chose point
        if dist_neuron < dist_bmu:
            
            # Update distance from weight to chosen point
            dist_bmu = dist_neuron
            
            # Best matching unit (BMU)
            weight_bmu = Weight_mat[:,node]
            index_bmu = node
            
    # Step 4: Define radius of winning neuron neighbourhood 
    # We skip this step because we only have 3 neurons in our application
    
    # Define learning rate
    learn_rate = learn_init*np.exp(-itter/total_itter)
    
    # Step 5: Update weight vectors (w_{t+1} = w_{t} + L(t)*(x_{i} - w_{t}))
    Weight_mat[:,index_bmu] = np.add(weight_bmu,learn_rate*(np.subtract(data_chosen,weight_bmu)))

# Show trained weights
print "Trained weights from SOM,", Weight_mat

Initialized weight matrix, [[-1.9161591   1.06828066 -0.2083208 ]
 [-1.51783354  1.72309184  0.59820164]
 [-1.43731576 -1.07362265 -1.09409884]
 [-0.95978022 -1.548511    0.52761936]
 [-0.45076747 -0.73501752  0.52356238]
 [-0.82139567  1.77945812 -1.39349715]]
Trained weights from SOM, [[-0.67933666  0.9438164  -0.5277563 ]
 [-0.36970586  0.26241107  0.08508913]
 [-0.04492589 -0.81322999  1.42797693]
 [-0.01620479 -0.74816579  1.11809929]
 [ 0.10019266 -0.54810955  1.1153748 ]
 [ 0.01018571 -0.85753327  1.23366498]]


In [5]:
# Initialize vector the classifies each student into group 1,2,3
group = np.zeros(len(educ_data.index))
    
# Classify input data
for index, data in educ_data.iterrows():
    
    # Initialize distance from cluster centroid
    dist_cluster = float("inf")
    
    # Find closest weight centroid
    for centroid in range(nodes_num):
        
        # Compute euclidean distance from centroid vector to data point
        dist_centroid = np.linalg.norm(data-Weight_mat[:,centroid])

        # Save centroid that is closest to data piont
        if dist_centroid < dist_cluster:

                # Update distance from weight to chosen point
                dist_cluster = dist_centroid

                # Best matching unit (BMU)
                group[index] = centroid+1
            
# Add group classifier column 
educ_data["group"] = group

# See labeled data (last column contains labels)
educ_data.head()

Unnamed: 0,g1freelunch,g1absent,g1readscore,g1mathscore,g1listeningscore,g1wordscore,group
0,0.9982,0.225407,-0.096017,1.07872,0.98855,-0.409491,1.0
1,-1.001803,0.653737,-1.271988,-0.566809,0.481993,-1.487941,1.0
2,0.9982,-0.488476,-0.693049,-0.126456,-1.156865,-0.541932,2.0
3,0.9982,1.082066,-0.096017,-0.613161,-0.352334,0.404077,2.0
4,0.9982,-0.774029,-1.597642,-1.586573,-1.9018,-1.677142,2.0


In [6]:
# Last 5 observations
educ_data.tail()

Unnamed: 0,g1freelunch,g1absent,g1readscore,g1mathscore,g1listeningscore,g1wordscore,group
5545,-1.001803,-0.488476,-0.783508,0.012603,-0.590714,-0.636533,1.0
5546,0.9982,0.225407,-0.620681,-0.195985,0.184019,-0.409491,2.0
5547,0.9982,-0.631252,-0.620681,-0.381397,-0.352334,-0.163528,2.0
5548,0.9982,0.368183,-0.530222,-1.215749,-1.09727,-0.636533,2.0
5549,-1.001803,-0.060146,1.04377,1.07872,1.703688,0.404077,3.0


In [7]:
# Let us figure out which group is weak, average, strong

# For group 1:
# Notice the test scores are close to 0 standard deviations away from the mean
# This is likely to be the "average" group
educ_data[educ_data.group == 1].describe()

Unnamed: 0,g1freelunch,g1absent,g1readscore,g1mathscore,g1listeningscore,g1wordscore,group
count,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0
mean,-0.588563,-0.235502,-0.035926,0.024238,0.04875,0.047829,1.0
std,0.809965,0.721381,0.547815,0.622079,0.692355,0.642441,0.0
min,-1.001803,-1.059582,-1.543366,-1.86469,-1.9018,-1.866344,1.0
25%,-1.001803,-0.774029,-0.439763,-0.450926,-0.411929,-0.409491,1.0
50%,-1.001803,-0.345699,-0.005558,0.012603,0.005235,-0.012167,1.0
75%,-1.001803,0.08263,0.338187,0.476132,0.481993,0.404077,1.0
max,0.9982,3.652044,1.496067,2.214367,2.746598,1.633888,1.0


In [8]:
# For group 2:
# Notice that of students recieving free or reduced lunch much more than average, and
# are absent more than average, and have relatively lower test scores. 
#This is likely to be the "weak" group.
educ_data[educ_data.group == 2].describe()

Unnamed: 0,g1freelunch,g1absent,g1readscore,g1mathscore,g1listeningscore,g1wordscore,group
count,2145.0,2145.0,2145.0,2145.0,2145.0,2145.0,2145.0
mean,0.847151,0.244311,-0.814091,-0.758617,-0.721961,-0.812927,2.0
std,0.528596,1.204693,0.538412,0.692439,0.666748,0.695914,0.0
min,-1.001803,-1.059582,-2.122306,-2.953984,-2.706331,-3.739441,2.0
25%,0.9982,-0.631252,-1.217713,-1.262102,-1.156865,-1.298739,2.0
50%,0.9982,-0.060146,-0.855876,-0.798573,-0.739701,-0.844655,2.0
75%,0.9982,0.796513,-0.475946,-0.265515,-0.262942,-0.295969,2.0
max,0.9982,10.933648,1.948363,1.611779,1.703688,1.179804,2.0


In [9]:
# Four group 3:
# The student test scores much higher than on average in this group, students
# are from advantageous backgrounds as lower proportion of students on free
# or reduced price lunch. This is the "gifted" group.
educ_data[educ_data.group == 3].describe()

Unnamed: 0,g1freelunch,g1absent,g1readscore,g1mathscore,g1listeningscore,g1wordscore,group
count,1411.0,1411.0,1411.0,1411.0,1411.0,1411.0,1411.0
mean,-0.45609,-0.038593,1.28835,1.118996,1.028631,1.168219,3.0
std,0.891172,0.907254,0.632555,0.728524,0.852945,0.492043,0.0
min,-1.001803,-1.059582,-0.494038,-1.424338,-1.09727,-0.409491,3.0
25%,-1.001803,-0.774029,0.772392,0.592015,0.481993,0.68788,3.0
50%,-1.001803,-0.202923,1.242781,1.07872,0.98855,1.179804,3.0
75%,0.9982,0.368183,1.948363,1.611779,1.495106,1.633888,3.0
max,0.9982,6.793128,2.346384,3.350013,4.176874,1.633888,3.0
