In [24]:
# -*- coding: utf-8 -*-

import numpy as np
from numpy import random
import matplotlib.pyplot as plt
from collections import namedtuple



In [25]:

def to_array(x):
    """ Convert an vector to array if needed """
    if len(x.shape)==1:
        x=x.reshape(1,x.shape[0])
    return x


def gen_arti(centerx=1,centery=1,sigma=0.1,nbex=1000,data_type=0,epsilon=0.02):
    """ Generateur de donnees,
        :param centerx: centre des gaussiennes
        :param centery:
        :param sigma: des gaussiennes
        :param nbex: nombre d'exemples
        :param data_type: 0: melange 2 gaussiennes, 1: melange 4 gaussiennes, 2:echequier
        :param epsilon: bruit dans les donnees
        :return: data matrice 2d des donnnes,y etiquette des donnnees
    """
    if data_type==0:
         #melange de 2 gaussiennes
         xpos=np.random.multivariate_normal([centerx,centerx],np.diag([sigma,sigma]),nbex/2)
         xneg=np.random.multivariate_normal([-centerx,-centerx],np.diag([sigma,sigma]),nbex/2)
         data=np.vstack((xpos,xneg))
         y=np.hstack((np.ones(nbex/2),-np.ones(nbex/2)))
    if data_type==1:
        #melange de 4 gaussiennes
        xpos=np.vstack((np.random.multivariate_normal([centerx,centerx],np.diag([sigma,sigma]),nbex/4),np.random.multivariate_normal([-centerx,-centerx],np.diag([sigma,sigma]),nbex/4)))
        xneg=np.vstack((np.random.multivariate_normal([-centerx,centerx],np.diag([sigma,sigma]),nbex/4),np.random.multivariate_normal([centerx,-centerx],np.diag([sigma,sigma]),nbex/4)))
        data=np.vstack((xpos,xneg))
        y=np.hstack((np.ones(nbex/2),-np.ones(nbex/2)))

    if data_type==2:
        #echiquier
        data=np.reshape(np.random.uniform(-4,4,2*nbex),(nbex,2))
        y=np.ceil(data[:,0])+np.ceil(data[:,1])
        y=2*(y % 2)-1
    # un peu de bruit
    data[:,0]+=np.random.normal(0,epsilon,nbex)
    data[:,1]+=np.random.normal(0,epsilon,nbex)
    # on mélange les données
    idx = np.random.permutation((range(y.size)))
    data=data[idx,:]
    y=y[idx]
    return data,y

def plot_data(data,labels=None):
    """
    Affiche des donnees 2D
    :param data: matrice des donnees 2d
    :param labels: vecteur des labels (discrets)
    :return:
    """
    cols,marks = ["red", "green", "blue", "orange", "black", "cyan"],[".","+","*","o","x","^"]
    if labels is None:
        plt.scatter(data[:,0],data[:,1],marker="x")
        return
    for i,l in enumerate(sorted(list(set(labels.flatten())))):
        plt.scatter(data[labels==l,0],data[labels==l,1],c=cols[i],marker=marks[i])



def make_grid(data=None,xmin=-5,xmax=5,ymin=-5,ymax=5,step=20):
    """ Cree une grille sous forme de matrice 2d de la liste des points
    :param data: pour calcluler les bornes du graphe
    :param xmin: si pas data, alors bornes du graphe
    :param xmax:
    :param ymin:
    :param ymax:
    :param step: pas de la grille
    :return: une matrice 2d contenant les points de la grille
    """
    if data!=None:
        xmax, xmin, ymax, ymin = np.max(data[:,0]),  np.min(data[:,0]), np.max(data[:,1]), np.min(data[:,1])
    x, y =np.meshgrid(np.arange(xmin,xmax,(xmax-xmin)*1./step), np.arange(ymin,ymax,(ymax-ymin)*1./step))
    grid=np.c_[x.ravel(),y.ravel()]
    return grid, x, y


def plot_frontiere(data,f,step=20):
    """ Trace un graphe de la frontiere de decision de f
    :param data: donnees
    :param f: fonction de decision
    :param step: pas de la grille
    :return:
    """
    grid,x,y=make_grid(data=data,step=step)
    plt.contourf(x,y,f(grid).reshape(x.shape),colors=('gray','blue'),levels=[-1,0,1])


In [26]:

##################################################################"
class Classifier(object):
    """ Classe generique d'un classifieur
        Dispose de 3 méthodes :
            fit pour apprendre
            predict pour predire
            score pour evaluer la precision
    """
    def fit(self,x,y):
        raise NotImplementedError("fit non  implemente")
    def predict(self,x):
        raise NotImplementedError("predict non implemente")
    def score(self,x,y):
        return (self.predict(x)==y).mean()


In [27]:
def load_usps(filename):
    with open (filename, "r" ) as f :
        f.readline()
        data =[ [ float ( x ) for x in l.split()]for l in f if len(l.split())>2]
    tmp = np.array(data)
    return tmp[:,1:],tmp[:,0].astype(int)
datax,datay = load_usps("train.txt")

In [28]:
# datax , datay = gen_arti(data_type = 0 , nbex = 1000)
# plot_data( datax , datay ) 
# plot_frontiere( datax , f , step=20)

In [46]:
class Bayes(Classifier):
    def fit(self,x,y):
        xLabel = np.array([datax[np.where(y==i)] for i in range(len(np.unique(datay)))])
        
        mean = [np.mean(x[:,i]) for i in range(x.shape[1])]
        var = [np.var(x[:,i]) for i in range(x.shape[1])]
        return mean,var    
            
    def predict(self,x,y):
        pass

In [47]:
class Knn(Classifier):
    def fit(self,x,y):
        pass
    def predict(self,x,y):
        pass

In [48]:
b = Bayes()
b.fit(datax,datay)

([0.0035826361267315868,
  0.018862295981346866,
  0.048847071732272665,
  0.11226237827458511,
  0.22653161431902344,
  0.38969757234947194,
  0.63100891510080914,
  0.9542306953778632,
  0.94735948429570693,
  0.71543615416266626,
  0.49589397887806885,
  0.31353037992044991,
  0.18479824441091758,
  0.093844877245919639,
  0.033996845425867513,
  0.0066388698395281852,
  0.010457413249211355,
  0.048667261006720614,
  0.13467686188451516,
  0.30413057193800574,
  0.5661950349746262,
  0.85656247428336307,
  1.1306683582498971,
  1.4118236181593746,
  1.4147754766150049,
  1.1488292415306542,
  0.92574242216431224,
  0.6527920724180496,
  0.38607817857632692,
  0.19993786860512963,
  0.08159895761898231,
  0.02101494993828007,
  0.017231381154848444,
  0.072594980112467442,
  0.19942600466328347,
  0.45016335207790426,
  0.76661719928679195,
  0.97645014401316688,
  1.0660666575229736,
  1.2109718831436016,
  1.1613453572898094,
  0.96467055273625024,
  0.91367480455355921,
  0.78632

In [50]:
xLabel = [datax[np.where(y==i)] for i in range(len(np.unique(datay)))]

In [51]:
xLabel

[array([], shape=(0L, 256L), dtype=float64),
 array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
 array([], shape=(0L, 256L), dtype=float64),
 array([], shape=(0L, 256L), dtype=float64),
 array([], shape=(0L, 256L), dtype=float64),
 array([], shape=(0L, 256L), dtype=float64),
 array([], shape=(0L, 256L), dtype=float64),
 array([], shape=(0L, 256L), dtype=float64),
 array([], shape=(0L, 256L), dtype=float64),
 array([], shape=(0L, 256L), dtype=float64)]