## Mahmoud MOhamamdi (800-8683389-mmoham12)
## ITSC 5010- Project

## TensorFlow Classifier

### Data Set : Breast Cancer
### Address :
    https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Original%29
### Feature Columns:            

1.Clump_Thickness 

2.Cell_Size_Uniformity

3.Cell_Shape_Uniformity

4.Marginal_Adhesion

5.Single_Epi_Cell_Size

6.Bare_Nuclei

7.Bland_Chromatin

8.Normal_Nucleoli

9.Mitoses

10.Class (2 for benign, 4 for malignant)

### Label Column 
    Class

In [3]:
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected

from sklearn import datasets
from sklearn.datasets.mldata import fetch_mldata
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#import tempfile

import re
import time
import numpy as np
import pandas as pd

In [16]:
class BaseModel(object):
    def __init__(self, sess, dataset_name , n_inputs, n_outputs, \
                 n_neurons, scope, epoch, batch_size, learning_rate):

        self.dataset = dataset_name
        self.n_inputs= n_inputs
        self.n_classes= n_outputs
        #self.n_hidden = n_hidden
        self.n_neurons = n_neurons
        
        self.scope= scope
        
        self.epoch = epoch
        self.batch_size = batch_size
        
        self.learning_rate= learning_rate
        
        self.sess = sess
        
        # Partitioning the dataset  
        X_data , y_labels, y_onehot = self.load_dataset(dataset_name)
        
        print(" Data %s ,  Label %s , OneHot %s " %(X_data.shape , y_labels.shape ,y_onehot.shape ))
        
        self.X_train, self.X_test, self.y_train, self.y_test \
            = train_test_split( X_data , y_labels, test_size = 0.2)
            
        
        self.build_model()

    
    def clean_data(self, data):
    
        #data.fillna(0. , inplace= True)
        data = data.astype(str)
        
        for col in data.columns:
            data[col]=data[col].map(str.strip) # removing all space from the string values

       
    # age: continuous.
        workclass_map={ 'Private':1, 'Self-emp-not-inc':2, 'Self-emp-inc':3, 'Federal-gov':4, 'Local-gov':5
                   , 'State-gov':6, 'Without-pay':7, 'Never-worked':8 }
    # fnlwgt: continuous.
        education_map ={'Bachelors':1, 'Some-college':2, '11th':3, 'HS-grad':4, 'Prof-school':5, 'Assoc-acdm':6
                    , 'Assoc-voc':7, '9th':8, '7th-8th':9, '12th':10, 'Masters':11, '1st-4th':12
                    , '10th':13, 'Doctorate':14, '5th-6th':15, 'Preschool':16 }
    # education-num: continuous.
        marital_status_map={ 'Married-civ-spouse':1, 'Divorced':2, 'Never-married':3, 'Separated':4
                        , 'Widowed':4, 'Married-spouse-absent':6, 'Married-AF-spouse':7}
        occupation_map={ 'Tech-support':1, 'Craft-repair':2, 'Other-service':3, 'Sales':4, 'Exec-managerial':5
                    , 'Prof-specialty':6, 'Handlers-cleaners':7, 'Machine-op-inspct':8, 'Adm-clerical':9
                    , 'Farming-fishing':10, 'Transport-moving':11, 'Priv-house-serv':12
                    , 'Protective-serv':13, 'Armed-Forces':14}
        relationship_map={ 'Wife':1, 'Own-child':2, 'Husband':3, 'Not-in-family':4, 'Other-relative':5, 'Unmarried':6}
        race_map ={'White':1, 'Asian-Pac-Islander':2, 'Amer-Indian-Eskimo':3, 'Other':4, 'Black':5}
        sex_map={'Female':1, 'Male':2}
        # capital-gain: continuous.
        # capital-loss: continuous.
        # hours-per-week: continuous.
        native_country_map ={'United-States':1, 'Cambodia':2, 'England':3, 'Puerto-Rico':4, 'Canada':5
                             , 'Germany':6, 'Outlying-US(Guam-USVI-etc)':7, 'India':8, 'Japan':9, 'Greece':10
                             , 'South':11, 'China':12, 'Cuba':13, 'Iran':14, 'Honduras':15, 'Philippines':16
                             , 'Italy':17, 'Poland':18, 'Jamaica':19, 'Vietnam':20, 'Mexico':21
                             , 'Portugal':21, 'Ireland':22
                             , 'France':23, 'Dominican-Republic':24, 'Laos':25, 'Ecuador':26
                             , 'Taiwan':27, 'Haiti':28, 'Columbia':29
                             , 'Hungary':30, 'Guatemala':31, 'Nicaragua':32, 'Scotland':33
                             , 'Thailand':34, 'Yugoslavia':35
                             , 'El-Salvador':36, 'Trinadad&Tobago':37, 'Peru':38, 'Hong':39, 'Holand-Netherlands':40}

        label_map={'<=50K':0, '>50K':1}
        
        theregex = re.compile(r'[^\d.-]+')

        data.replace(to_replace='?', value ='0') 

        data.replace(inplace =True ,to_replace = {'workclass':workclass_map 
        ,'education':education_map
                         ,'marital-status':marital_status_map 
                         ,'occupation':occupation_map , 'relationship':relationship_map
                          ,'race':race_map , 'sex':sex_map ,'native-country':native_country_map
                            ,'label': label_map
                         } )

        data = data.astype('str').applymap(lambda x: re.sub(r'[^\d.]+', '0', x))  

        data = data.astype('int')

        return pd.DataFrame(data)


    def load_dataset(self, dataset_name):
        
              
        url ="https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
        columns=['age' , 'workclass' ,'fnlwgt','education' ,'education-num','marital-status' ,'occupation', 'relationship'
            ,'race' , 'sex' ,'capital-gain' ,'capital-loss' ,'hours-per-week' ,'native-country','label']

        all_data = pd.read_csv(url , header= None , names=columns, index_col=None)
        
        all_data=all_data[:10000]
        
        all_data.reset_index()
        print(all_data.shape)
        
        cleaned_data = self.clean_data(all_data)
        
        
#         col_names = "id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class"
        
#         col_names= col_names.split(',')
        
#         base_dir = '/home/mmoham12/Projects/DeepLearningHW/'
    
#         df = pd.read_csv(base_dir + data_file, names= col_names)

#         df.replace('?', np.nan, inplace = True)
#         df.dropna(inplace=True)
#         df.drop(['id'], axis = 1, inplace = True)

#         df['class'].replace('2',0, inplace = True)
#         df['class'].replace('4',1, inplace = True)
        
#         df['bare_nuclei'] = pd.to_numeric(df['bare_nuclei'])

#         #df.to_csv("cleaned_data.csv", index = False)

        y= np.array(cleaned_data['label'], dtype= np.int)
               
        X = np.array(cleaned_data.drop('label', axis =1), dtype= np.float)
             
       
        y = y.reshape(y.shape[0])
        
#         if dataset_name == 'breast-cancer':
#             y[y==2]= 0 
#             y[y==4]= 1
        
        y_onehot = np.zeros( (len(y) , self.n_classes), dtype=np.float)
        
        for i, lbl in enumerate(y):
            y_onehot[i, y[i]] = 1.0
            
        return X, y , y_onehot# data , labels
        
      

    def build_model(self):
        
        with tf.name_scope(self.scope) :
            
            inputs_dim= [self.n_inputs]
            
            y_dim = []
            
            self.inputs= tf.placeholder(tf.float32, shape=[None] + inputs_dim,  name='inputs')
            
            self.y = tf.placeholder(tf.float32, shape=[None] + y_dim, name='y')

           # print("self.X_train %s " %(self.X_train.shape ))
            
            # Hidden Layer with ReLU Activation function as default
            
            inputs = fully_connected(self.X_train, self.n_neurons )
            
            print("inputs %s " %(inputs.shape ))
            
            hidden1 = fully_connected(inputs, self.n_neurons )
            
            print("Hidern 1 %s " %(hidden1.shape))

            # Last Layer of model without applying Activation function: Logits
            logits = fully_connected(hidden1, self.n_classes, activation_fn=None)

            # Defining Loss function based on Entropy
            
            print("Logits %s , Label %s" %(logits.shape , self.y_train.shape))
            
            
           # xentropy = tf.nn.sigmoid_cross_entropy_with_logits(labels= self.y_train , logits= logits)
            xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits= logits ,labels= self.y_train )

            self.cost = tf.reduce_mean(xentropy, name="cost") # Average of all logits
            
            self.saver = tf.train.Saver()
            
            # Evaluation of Logits
            
            evals = tf.nn.in_top_k(tf.cast(logits , tf.float32), self.y_train, 1)
            
            self.accuracy = tf.reduce_mean( tf.cast(evals , tf.float32))

        

    def train(self):
        print("Start Training...\n")
        
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.cost)
        
               
        tf.global_variables_initializer().run(session= self.sess)
                    
        num_batches = len(self.X_train) // self.batch_size 
        
        
        start_time = time.time()
        
        for epoch in range(self.epoch):
            
            #print("Epoch Index"+ epoch)
            
            for idx in range(num_batches):
                
                #print("Batch Index"+ idx)
                
                batch_data = self.X_train[idx *  self.batch_size: (idx+1) *  self.batch_size] 
                batch_labeles= self.y_train[idx *  self.batch_size: (idx+1) *  self.batch_size] 
                
                #print("Batch Data %s , Batch Label %s" %(batch_data.shape , batch_labeles.shape))
                
                with self.sess.as_default():
                    
                    self.sess.run([optimizer],
                        feed_dict={self.inputs: batch_data, 
                                   self.y:      batch_labeles  
                                  })
                
#                     acc_train = self.accuracy.eval( feed_dict={
#                     self.inputs: batch_data, 
#                     self.y:      batch_labeles  
#                      })
                
                    acc_test  = self.accuracy.eval( feed_dict={
                    self.inputs: self.X_test, 
                    self.y:      self.y_test  
                                                          
                    })
            
            
            #if (epoch+1) % train_show == 0:

            cost = self.cost.eval(session= sess, feed_dict= {self.inputs: self.X_train , self.y: self.y_train})


            print ("Epoch:[%02d], Batch :[%2d / %3d],cost= %.4f " % ( epoch+1 ,idx+1,\
                 num_batches,cost))
                
#             print("Dataset:[%s]-> Epoch:[%2d], time: %4.4f, Accuracy: %.6f"
#                   % (self.dataset, epoch+1,
#                      time.time() - start_time, acc_test))
            
#             print("Dataset:[%s]-> Epoch:[%2d], Batch :[%2d/%3d] time: %4.4f, Accuracy: %.6f"
#                   % (self.dataset, epoch+1, idx, num_batches,
#                      time.time() - start_time, acc_test))

            

In [18]:
#def main():
    
with tf.Session() as sess:
        nn_obj = BaseModel(
            sess,
            dataset_name ='Adult',
            scope = 'Project', 
            epoch = 15, 
            n_inputs = 14, # Adult has  14 features
            n_outputs = 2,# Adult has  2 classes            
            n_neurons = 20,           
            batch_size= 50, 
            learning_rate = 0.1
            )
        
nn_obj.train()

(10000, 15)
 Data (10000, 14) ,  Label (10000,) , OneHot (10000, 2) 
inputs (8000, 20) 
Hidern 1 (8000, 20) 
Logits (8000, 2) , Label (8000,)
Start Training...

Epoch:[01], Batch :[160 / 160],cost= 0.5514 
Epoch:[02], Batch :[160 / 160],cost= 0.5514 
Epoch:[03], Batch :[160 / 160],cost= 0.5514 
Epoch:[04], Batch :[160 / 160],cost= 0.5514 
Epoch:[05], Batch :[160 / 160],cost= 0.5514 
Epoch:[06], Batch :[160 / 160],cost= 0.5514 
Epoch:[07], Batch :[160 / 160],cost= 0.5514 
Epoch:[08], Batch :[160 / 160],cost= 0.5514 
Epoch:[09], Batch :[160 / 160],cost= 0.5514 
Epoch:[10], Batch :[160 / 160],cost= 0.5514 
Epoch:[11], Batch :[160 / 160],cost= 0.5514 
Epoch:[12], Batch :[160 / 160],cost= 0.5514 
Epoch:[13], Batch :[160 / 160],cost= 0.5514 
Epoch:[14], Batch :[160 / 160],cost= 0.5514 
Epoch:[15], Batch :[160 / 160],cost= 0.5514 
