In [1]:
import os
import random
import numpy as np 
import pandas as pd
import matplotlib as plt

from pandas import DataFrame, array
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle

## 1. Lectura de datos

In [2]:
df_review = pd.read_csv('IMDB Dataset.csv')
df_review.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [40]:
df_review['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

1.1 Separación de datos

In [3]:
df_positive = df_review.loc[df_review['sentiment'] == 'positive']
df_negative = df_review.loc[df_review['sentiment'] == 'negative']

df_positive = df_positive[:2000]
df_negative = df_negative[:2000]

frames = [df_positive, df_negative]
df_final = pd.concat(frames)

In [4]:
X = df_final['review']
y = df_final['sentiment']

df_final.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive


Mezclamos el DataFrame

In [5]:
df_final = df_final.sample(frac=1).reset_index(drop=True)

df_final.head()

Unnamed: 0,review,sentiment
0,"A pity, nobody seems to know this little thril...",positive
1,This movie is lame and not funny at all. The p...,negative
2,This is one of those films that explore the cu...,positive
3,Perfect movies are rare. Even my favorite film...,positive
4,I was a 20 year old college student living wit...,positive


## 2. Vectores de Características

2.1 Parámetros iniciales para SVC

In [6]:
regularization: array = [1, 4, 8, 16, 32, 64, 128, 256]
kernel: array = ['linear', 'rbf', 'poly', 'sigmoid']
degree: array = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
gama: array = ['scale', 'auto']
probability: array = [True, False] 

2.2 Parámetros iniciales para la selección de datos

In [7]:
test_size: array = [0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.30, 0.31, 0.32, 0.33, 0.34]
random_state: array = list(range(1, 120))
sufffle: array = [True, False]

## 3. El Algoritmo Genético

3.1 Funciones generadoras de configuraciones

In [8]:
def generateConfigForSVC() -> object:
  return {
    'C': random.choice(regularization), 'kernel': random.choice(kernel),
    'degree': random.choice(degree), 'gamma': random.choice(gama),
    'probability': random.choice(probability)
  }

def generateConfigForSplitData() -> object:
  return {
    'test_size': random.choice(test_size), 'random_state': random.choice(random_state),
    'shuffle': random.choice(sufffle)
  }

def generateChildrenConfig(hiper: list) -> object:
  return {
    'C': hiper[0], 'kernel': hiper[1],
    'degree': hiper[2], 'gamma': hiper[3],
    'probability': hiper[4]
  }
def generateChildrenConfigSplit(hiper: list) -> object:
  return {
    'test_size': hiper[0], 'random_state': hiper[1],
    'shuffle': hiper[2]
  }

3.2 El individuo

In [9]:
class IndiviudalSVCConfig:
  fitness: float 
  config: object
  configSplit: object

  def __init__(self, f: float, conf: object, split: object) -> None:
    self.fitness = f
    self.config = conf
    self.configSplit = split

  def getFitness(self) -> float:
    return float(self.fitness)

  def getConfig(self) -> object:
    return (self.config)
  
  def setFitness(self, fit: float) -> None:
    self.fitness = fit

  def getConfigSplit(self) -> object:
    return (self.configSplit) 

  def setConfigSplit(self, split: object) -> None:
    self.configSplit = split 

3.3 Inciación de la Población

In [10]:
def generatePopulation(nPop: int) -> list:
  population: list = []
  print(f'Generando una Población de {nPop} individuos')
  
  config: object
  splitConf: object
  fitnessDefault: float = 0
  
  for _ in range(int(nPop)):
    config = generateConfigForSVC()
    splitConf = generateConfigForSplitData()
    individuo: IndiviudalSVCConfig = IndiviudalSVCConfig(
      f = fitnessDefault, conf = config, split = splitConf
    )
    population.append(individuo)

  return population

3.4 La Reproducción

In [11]:
def crossoverForIndivudualConfigSVC(fath: IndiviudalSVCConfig, moth: IndiviudalSVCConfig) -> IndiviudalSVCConfig:
    fathConf: list = list(fath.getConfig().values())
    mothConf: list = list(moth.getConfig().values())
    pointCrossover: int = random.randint(0, len(fathConf)-1)
    
    fsplit: list = list(fath.getConfigSplit().values())
    msplit: list = list(moth.getConfigSplit().values())

    for index in range(pointCrossover, len(fathConf)):
        fathConf[index] = mothConf[index]
    
    for index in range(pointCrossover, len(fsplit)):
        fsplit[index] = msplit[index]
    
  
    newConf: object = generateChildrenConfig(fathConf)
    newCOnfSplit: object = generateChildrenConfigSplit(fsplit)
    
    fitnessDefault: float = 0
    
    children: IndiviudalSVCConfig = IndiviudalSVCConfig(
      f = fitnessDefault, conf = newConf, split = newCOnfSplit
    ) 
  
    return children

3.5 La mutación 

In [12]:
def mutationForIndivudualConfigSVC(indiConf: IndiviudalSVCConfig) -> IndiviudalSVCConfig: 
    
  pass

3.6 La selección 

In [90]:
def search(list_prob: list, numforsearch) -> int:
    piso: int = 0
    techo: int = len(list_prob)-1
    itera: int = 0 
    
    while True:
        if piso <= numforsearch <= techo:
            break
        else 
            itera += 1
    
    x = 0
    y = 0
    itera = 0
    while y < len(list_prob):
        if list_prob[x] <= numforsearch <= list_prob[y]:
            break
        else :
            x += 1 
            
    
    return itera
        
            
        

SyntaxError: invalid syntax (2014331450.py, line 9)

In [89]:
def selectBestIndividualsMaxtoMinFitness(population: list, numIndiPadres: int) -> list:
    sum_fitness: float = 0
    for it in population:
        sum_fitness += it.getFitness()
    
    list_proba: list = [None] * len(population)
    for i in range(len(population)):
        list_proba[i] = population[i].getFitness()/sum_fitness
    
    distancia: float = 1.0/numIndiPadres
        
    res: list = []
    
    start = 0;
    end = distancia 
    while len(res) < numIndiPadres:

IndentationError: expected an indented block (1924890373.py, line 18)

In [13]:
def temp_selection(population: list) -> list:
    population.sort(key = lambda fit: fit.fitness, reverse = True)
    rest = population[:4]
    return rest

3.7 La Evaluación

In [14]:
def evaluatePopulation(populat: list) -> None:
  print(f'Evaluando a los individuos tamaño {len(populat)}')
  
  count: int = 0
  xConf: object 
  splitC: object
  
  for indi in populat:
    print(f'Evaluando individuo {count}')
    xConf = indi.getConfig() 
    splitC = indi.getConfigSplit()
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, **splitC) 

    tfid = TfidfVectorizer(stop_words = 'english')
    X_train = tfid.fit_transform(X_train)
    X_test = tfid.transform(X_test)
    
    svc = SVC(**xConf)
    svc.fit(X_train, y_train)
    predicted = svc.predict(X_test)
    indi.setFitness(getAccuracyForModel(y_test = y_test, y_pred = predicted))
    count += 1

## 4. Clasificación por Vectores de Soporte (SVC)

4.1 Funciones de obtención de métricas - Accuracy

In [15]:
def getAccuracyForModel(y_test: np.ndarray, y_pred: np.ndarray) -> float:
  return float(metrics.accuracy_score(y_test, y_pred))

## 5. Setup

In [16]:
def geneticAlgorithmInit(nPop: int, epochs: int) -> IndiviudalSVCConfig:
    population: list = generatePopulation(nPop)
        
    for i in range(epochs):
        print(f'EPOOOOCA ------------- {i} \n')
        evaluatePopulation(population)
        candidates = temp_selection(population)
        count: int = 0
        for c in candidates:
            print(f'Individuo {count} : Fitness = {c.getFitness()}')
            count += 1
        print('Cruzando individuos\n')
        pos = 0
        while pos < len(population):
            proba = random.random()
            if proba < 0.8:
                f = random.choice(candidates)
                m = random.choice(candidates)
                population[pos] = crossoverForIndivudualConfigSVC(f, m)
                pos += 1
            else:
                pass
            
    return population[0]

In [17]:
config = geneticAlgorithmInit(10, 4)
fin_conf = config.getConfig()
fin_spli = config.getConfigSplit()

print(f'Config: {fin_conf}')
print(f'Split: {fin_spli}')

Generando una Población de 10 individuos
EPOOOOCA ------------- 0 

Evaluando a los individuos tamaño 10
Evaluando individuo 0
Evaluando individuo 1
Evaluando individuo 2
Evaluando individuo 3
Evaluando individuo 4
Evaluando individuo 5
Evaluando individuo 6
Evaluando individuo 7
Evaluando individuo 8
Evaluando individuo 9
Individuo 0 : Fitness = 0.855
Individuo 1 : Fitness = 0.85
Individuo 2 : Fitness = 0.4967741935483871
Individuo 3 : Fitness = 0.30434782608695654
Cruzando individuos

EPOOOOCA ------------- 1 

Evaluando a los individuos tamaño 10
Evaluando individuo 0
Evaluando individuo 1
Evaluando individuo 2
Evaluando individuo 3
Evaluando individuo 4
Evaluando individuo 5
Evaluando individuo 6
Evaluando individuo 7
Evaluando individuo 8
Evaluando individuo 9
Individuo 0 : Fitness = 0.855
Individuo 1 : Fitness = 0.855
Individuo 2 : Fitness = 0.85
Individuo 3 : Fitness = 0.85
Cruzando individuos

EPOOOOCA ------------- 2 

Evaluando a los individuos tamaño 10
Evaluando individuo 0

DEFECTO

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

tfid = TfidfVectorizer(stop_words = 'english')

X_train = tfid.fit_transform(X_train)
X_test = tfid.transform(X_test)

from sklearn.svm import SVC

print(f'Executing SVC Defecto')

svc = SVC()
svc.fit(X_train, y_train)

print(svc.predict(tfid.transform(['An excellent movie'])))

print(svc.score(X_test, y_test))

Executing SVC Defecto
['positive']
0.85


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3)

tfid = TfidfVectorizer(stop_words = 'english')

X_train = tfid.fit_transform(X_train)
X_test = tfid.transform(X_test)

from sklearn.svm import SVC

print(f'Executing SVC Defecto')

svc = SVC(**fin_conf)
svc.fit(X_train, y_train)

print(svc.predict(tfid.transform(['An excellent movie'])))
print(svc.score(X_test, y_test))

Executing SVC Defecto
['positive']
0.8558333333333333
