In [1]:
import pandas as pd
import numpy as np

# Load the glass data and drop the 'Id' column in one chained expression,
# so `df` is bound exactly once, then show the summary statistics.
df = pd.read_csv("../datasets/glass.data.txt").drop(columns=['Id'])
df.describe()

Unnamed: 0,Refractive_index,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
count,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0
mean,1.518365,13.40785,2.684533,1.444907,72.650935,0.497056,8.956963,0.175047,0.057009,2.780374
std,0.003037,0.816604,1.442408,0.49927,0.774546,0.652192,1.423153,0.497219,0.097439,2.103739
min,1.51115,10.73,0.0,0.29,69.81,0.0,5.43,0.0,0.0,1.0
25%,1.516523,12.9075,2.115,1.19,72.28,0.1225,8.24,0.0,0.0,1.0
50%,1.51768,13.3,3.48,1.36,72.79,0.555,8.6,0.0,0.0,2.0
75%,1.519157,13.825,3.6,1.63,73.0875,0.61,9.1725,0.0,0.1,3.0
max,1.53393,17.38,4.49,3.5,75.41,6.21,16.19,3.15,0.51,7.0


In [2]:
# Maps continuous values to the 0-based index of the bin they fall into.
def discretize_column(dataframe_column, intervals):
    """Replace each value with the index of the `pd.cut` bin that contains it.

    Parameters
    ----------
    dataframe_column : pandas.Series or array-like
        One-dimensional numeric values.
    intervals : int or sequence of scalars
        Passed unchanged to `pd.cut` as `bins`: an int requests that many
        equal-width bins over the range of the data, a sequence supplies
        explicit bin edges.

    Returns
    -------
    list
        One entry per input value, in input order: the 0-based bin index as
        a Python int, or None when the value is missing or lies outside the
        bins (these inputs used to fail with ``KeyError: nan``).

    Raises
    ------
    ValueError
        Propagated from `pd.cut`, e.g. when the input is empty.
    """
    # Wrapping in a Series guarantees the `.cat` accessor even for plain
    # lists/arrays, for which pd.cut would return a bare Categorical.
    binned = pd.cut(pd.Series(dataframe_column), intervals)
    # Category codes are positions within `.cat.categories`; -1 means "no bin".
    return [code if code >= 0 else None for code in binned.cat.codes.tolist()]


In [3]:
def write_arff(df, filename, intervals):
    """Export `df` as an ARFF file in which every attribute is nominal.

    Every column except "Type" is passed through `discretize_column` with
    `intervals` bins and declared with the nominal values
    ``0 .. intervals-1``. The "Type" column is written unchanged and
    declared with the nominal values ``1 .. 7``. The relation is named
    "type"; relation and attribute names are lower-cased.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Type" column (KeyError otherwise).
    filename : str or path-like
        Destination path, opened in text mode "w" (an existing file is
        overwritten).
    intervals : int
        Number of bins per discretized column; also used for ``range()``
        when declaring the attribute values.

    Notes
    -----
    All columns are discretized *before* the file is opened, so a failure
    there does not leave a truncated file behind, and the file handle is
    always closed. Missing values in the data section are written as ``?``.
    """

    def write_relation(file, relation):
        # Header line naming the relation, followed by a blank line
        file.write("@RELATION {}\n\n".format(relation.lower()))

    def write_attribute(file, attr, possible_values):
        # Nominal declaration: lower-cased name followed by {v1,v2,...}
        file.write("@ATTRIBUTE {} {{{}}}\n".format(attr.lower(), ",".join(map(str, possible_values))))

    def write_data(file, columns):
        file.write("\n@DATA\n")
        # `columns` holds one list per attribute; zip(*) turns them into rows.
        # Rows are joined by hand with "\n" because DataFrame.to_csv() returns
        # a string terminated with os.linesep, which a text-mode file would
        # turn into "\r\r\n" on Windows.
        for row in zip(*columns):
            file.write(",".join("?" if pd.isna(value) else str(value) for value in row) + "\n")

    feature_names = list(df.drop(columns=["Type"]))

    # For values that need to be mapped
    mapped_columns = [discretize_column(df[name], intervals) for name in feature_names]
    # Type column is already discrete
    mapped_columns.append(list(df["Type"]))

    with open(filename, "w") as file:
        # @RELATION
        write_relation(file, "Type")

        # @ATTRIBUTE
        for name in feature_names:
            write_attribute(file, name, range(intervals))
        write_attribute(file, "Type", range(1, 8))

        # @DATA
        write_data(file, mapped_columns)

In [4]:
# Export the loaded frame as ARFF, requesting 20 intervals per discretized column.
write_arff(df, filename="../datasets/crystal.arff", intervals=20)