# Pokemon Main-Type Classifier (Water vs Normal vs Grass vs Bug)

### Import libraries and read the CSV files

In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.gridspec as gridspec
import seaborn as sns
import os
import tensorflow as tf
from tensorflow import keras as ks


from pylab import rcParams
import sklearn
from sklearn import linear_model
from sklearn.preprocessing import scale
from collections import Counter
from scipy.stats import pearsonr


def calculate_pvalues(df):
    """Return the matrix of Pearson-correlation p-values for the numeric columns of ``df``.

    Rows with a missing value in any column are dropped first, then only the
    numeric (and boolean) columns are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Square float frame whose index and columns are the numeric column
        names. The entry at row ``c``, column ``r`` is the p-value returned by
        ``pearsonr(df[r], df[c])``, rounded to 4 decimals.

    Raises
    ------
    ValueError
        Propagated from ``pearsonr`` when too few rows remain after dropping
        missing values.
    """
    numeric = df.dropna().select_dtypes(include=['number', 'bool'])
    pvalues = pd.DataFrame(index=numeric.columns, columns=numeric.columns, dtype=float)
    for r in numeric.columns:
        for c in numeric.columns:
            # Write with .loc: the chained form ``pvalues[r][c] = ...`` assigns to a
            # temporary object under pandas copy-on-write and leaves the frame empty.
            pvalues.loc[c, r] = round(pearsonr(numeric[r], numeric[c])[1], 4)
    return pvalues


from keras.utils import to_categorical
from keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split


In [108]:

# Index table that ships with the image dataset; its names are used as the filter below.
df_original = pd.read_csv('../input/pokemon-images-and-types/pokemon.csv')
for summary in (df_original.shape, df_original.head()):
    print(summary)
pokemon_names_original_list = df_original.Name.tolist()

# Stats table; names are lower-cased before being compared with the index table.
df = pd.read_csv('../input/pokemon-stats/Pokemon stats.csv')
df = df.assign(Name=df["Name"].str.lower())
for summary in (df.shape, df.head()):
    print(summary)

# Keep only the stats rows whose name also appears in the image index table.
in_image_index = df["Name"].isin(pokemon_names_original_list)
df = df.loc[in_image_index]
print(df.shape)
df.head()



### Sort the Pokemon names so that they line up with the available image files

In [109]:
# Sort by name (alphabetically) and renumber the rows from 0.
df = df.sort_values(by=['Name'], ascending=True).reset_index(drop=True)
print(df.head())

# Column types. A bare expression in the middle of a cell is not displayed, so print it.
print(df.dtypes)

# HP, Attack, Defense and Speed are the columns used for the linear regression below.
hp_attack_defense_speed_df = df[["HP", "Attack", "Defense", "Speed"]]
print(hp_attack_defense_speed_df.head())

# Total number of missing values in the four stats (the original author recorded 0).
print(hp_attack_defense_speed_df.isnull().sum().sum())

# Correlation matrix of the four stats and the matching p-values.
print(hp_attack_defense_speed_df.corr())
print(calculate_pvalues(hp_attack_defense_speed_df))

# Scatter plots that visualise the correlation matrix.
sns.pairplot(hp_attack_defense_speed_df)

# Regress Attack on HP and Defense (the original author noted a rather strong
# correlation between hp&attack and attack&defense).
reg = linear_model.LinearRegression()
reg.fit(hp_attack_defense_speed_df[["HP", "Defense"]], hp_attack_defense_speed_df["Attack"])
print(reg.coef_)       # author's recorded run: 0.3816505, 0.36193765 for HP and Defense
print(reg.intercept_)  # author's recorded run: 23.14

# Predict Attack for a hypothetical Pokemon with HP 100 and Defense 70. The input is
# passed as a frame with the training column names so scikit-learn does not warn
# about missing feature names.
hypothetical_pokemon = pd.DataFrame({"HP": [100], "Defense": [70]})
print(reg.predict(hypothetical_pokemon))  # author's recorded run: 86.64




In [110]:
# Main (first) type of every Pokemon as a plain list; both names refer to the same list.
main_type = type1 = df.Type1.tolist()

plt.figure(figsize=(17, 5))
sns.countplot(x=main_type)
plt.title('Frequencies of main types of Pokemons')
# Author's observation: water, normal, grass and bug look like the most common main types.
plt.show()

In [111]:
# Count each main type once; value_counts orders the result from most to least frequent.
type1_counts = df['Type1'].value_counts()
main_type_class = type1_counts.keys()   # the distinct main types
main_type_freq = type1_counts.values    # the matching frequencies


In [112]:
# Two-column table: main type and how often it occurs.
main_type_freq_chart = pd.DataFrame(
    dict(main_type_class=main_type_class, main_type_freq=main_type_freq)
)
main_type_freq_chart.head()

In [113]:
# Show eight sample sprites (rows 0, 1, 8, 27, ... i.e. i**3) in a 2x4 grid.
path = '../input/pokemon-images-and-types/images/images/'
fig, axes_grid = plt.subplots(nrows=2, ncols=4, figsize=(12, 10))
(ax1, ax2, ax3, ax4), (ax5, ax6, ax7, ax8) = axes_grid
ax = list(axes_grid.flat)
for i, panel in enumerate(ax):
    pokemon_name = df['Name'][i**3]
    img = mpimg.imread(path + pokemon_name + '.png')  # assumes the sprite is stored as .png
    panel.imshow(img)
    panel.set_title(pokemon_name)
    panel.axis('off')
plt.tight_layout()
plt.show()
    

### Image Path Dataframe 

In [114]:
# Collect the path of every image file whose name (without extension) is a Pokemon in df.
path = '../input/pokemon-images-and-types/images/images/'
img_name = sorted(os.listdir(path))  # all file names, sorted alphabetically

# Build the lookup once instead of converting the Name column on every iteration.
known_names = set(df["Name"])

# os.path.splitext strips the extension whatever its length.
img_paths = [
    path + file_name
    for file_name in img_name
    if os.path.splitext(file_name)[0] in known_names
]
print(len(img_paths))



    


    

In [115]:
# Attach the image path to each row by Pokemon name rather than by list position.
# Assigning the list directly raises when len(img_paths) != len(df) and relies on
# both being in the same order; a name lookup has neither problem.
path_by_name = {os.path.splitext(os.path.basename(p))[0]: p for p in img_paths}
df['filepath'] = df['Name'].map(path_by_name)
# Rows without a matching image file get NaN; report how many there are.
print(df['filepath'].isna().sum(), 'pokemon without a matching image file')

In [116]:

# Main types that get a class label, with their numeric codes.
type_to_code = {'Water': 1, 'Normal': 2, 'Grass': 3, 'Bug': 4}

# One entry per row of df: the main type and its code, or None for any other type.
poke_type = [
    first_type if first_type in type_to_code else None
    for first_type in df['Type1']
]
code = [type_to_code.get(first_type) for first_type in df['Type1']]


In [117]:
# Add the Pokemon type and code vectors to the main spreadsheet.
df['main_type'] = poke_type
df['code'] = code

# Build the training frame from df:
#  - drop the raw type and name columns,
#  - expose the label under the column name 'type', which is the name the
#    plotting and generator cells read,
#  - drop rows with no label (types outside the selected classes are None) and
#    rows with no image path, since neither can be used for training.
# The previous filter compared a non-existent 'type' column with the string 'NaN',
# which raised KeyError and would not have removed None labels anyway.
new_df = (
    df.drop(['Type1', 'Type2', 'Name'], axis=1)
      .rename(columns={'main_type': 'type'})
      .dropna(subset=['type', 'filepath'])
      .reset_index(drop=True)
)



In [118]:
# Preview the first rows of the training frame.
new_df.head()

In [119]:
# Bar chart of how many rows of the training frame belong to each Pokemon type.
type_axis = sns.countplot(x=new_df['type'])
type_axis.set_title('Pokemons going for training')
type_axis.set_ylabel('Number of images')

## Augmentation

In [None]:
# Training images are rescaled to [0, 1] and randomly augmented.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
    validation_split=0.1
)

# Validation images are only rescaled: evaluating on randomly distorted images
# makes the validation metrics noisy and not comparable between epochs.
# NOTE(review): the same validation_split is used so that both generators divide
# new_df at the same row - confirm against the Keras version in use.
validation_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.1
)

# Options shared by the training and validation flows.
common_flow_options = dict(
    x_col='filepath',
    y_col='type',
    target_size=(120, 120),  # per the original author, all pictures are 120x120
    color_mode='rgba',
    class_mode='categorical',
    shuffle=True,
    seed=1,
)

train_generator = train_datagen.flow_from_dataframe(
    new_df,
    batch_size=32,
    subset='training',
    **common_flow_options
)

validation_generator = validation_datagen.flow_from_dataframe(
    new_df,
    batch_size=4,
    subset='validation',
    **common_flow_options
)


In [None]:
# Pull one batch from the training generator; element 0 of the batch is the image array.
image_sample = next(train_generator)[0]

# Show the first nine images of the batch in a 3x3 grid.
plt.figure(figsize=(10, 10))
for slot in range(1, 10):
    plt.subplot(3, 3, slot)
    plt.imshow(image_sample[slot - 1])
    plt.axis('off')
plt.show()
image_sample.shape

## Model

In [None]:
# One softmax unit per class found by the training generator. The previous
# hard-coded 3 did not match the four labelled types (Water, Normal, Grass, Bug),
# so the one-hot targets and the network output had different widths.
num_classes = len(train_generator.class_indices)

model = ks.models.Sequential([
    # Dense on an image-shaped input acts on the last axis, i.e. it mixes the
    # 4 RGBA channels of every pixel.
    ks.layers.Dense(4, input_shape=(120, 120, 4)),

    # Four conv/pool stages with 7x7 kernels and a doubling filter count.
    ks.layers.Conv2D(64, (7, 7), activation='relu'),
    ks.layers.MaxPooling2D(2, 2),

    ks.layers.Conv2D(128, (7, 7), activation='relu'),
    ks.layers.MaxPooling2D(2, 2),

    ks.layers.Conv2D(256, (7, 7), activation='relu'),
    ks.layers.MaxPooling2D(2, 2),

    ks.layers.Conv2D(512, (7, 7), activation='relu'),
    ks.layers.MaxPooling2D(2, 2),
    ks.layers.Dropout(0.2),

    # Classifier head.
    ks.layers.Flatten(),
    ks.layers.Dense(1024, activation='relu'),
    ks.layers.Dense(num_classes, activation='softmax'),
])

# 'acc' is the metric name that the stopping callback and the history plots read.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

In [None]:
# Print the layer-by-layer summary of the network.
model.summary()

In [None]:
# Show the list of layer objects that make up the model.
model.layers

### Callback to stop the training when a particular accuracy is reached

In [None]:
class myCallback(tf.keras.callbacks.Callback):
    """Stop training once both training and validation accuracy exceed 50%."""

    def on_epoch_end(self, epoch, logs=None):
        """Check the 'acc' and 'val_acc' entries of ``logs`` after each epoch.

        Parameters
        ----------
        epoch : int
            Index of the epoch that just finished (unused).
        logs : dict or None
            Metrics reported for the epoch.
        """
        # A None default avoids sharing one mutable dict between calls.
        logs = logs or {}
        acc = logs.get('acc')
        val_acc = logs.get('val_acc')
        # A metric can be absent from logs; comparing None with a float would
        # raise TypeError, so keep training in that case.
        if acc is None or val_acc is None:
            return
        if acc > 0.5 and val_acc > 0.5:
            # With four classes, uniform guessing gives 25%.
            print('\n reached 50% accuarcy so stopping training')
            self.model.stop_training = True
callbacks = myCallback()


In [None]:
# Train the network. batch_size is not passed here: the generators already yield
# batches (their size is set in flow_from_dataframe), and Keras rejects a
# batch_size argument when the input is a generator.
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=100,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(),  # default settings
        callbacks,  # stops once acc and val_acc both exceed 0.5
    ]
)


In [None]:
plt.style.use('ggplot')

# Per-epoch metrics recorded by model.fit.
acc, val_acc = history.history['acc'], history.history['val_acc']
loss, val_loss = history.history['loss'], history.history['val_loss']

epochs = range(len(acc))

# One figure per metric: training curve in red, validation curve in blue.
curve_specs = (
    ('Training and Validation Accuarcy', '-----epochs--->', 'Accuracy',
     (acc, 'training_accuracy'), (val_acc, 'validation_accuracy')),
    ('Training and Validation Loss', '----epochs--->', 'Loss',
     (loss, 'training_loss'), (val_loss, 'validation_loss')),
)
for figure_title, x_label, y_label, (train_values, train_label), (val_values, val_label) in curve_specs:
    plt.figure(figsize=(6, 5))
    plt.plot(epochs, train_values, 'r', label=train_label)
    plt.plot(epochs, val_values, 'b', label=val_label)
    plt.title(figure_title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend()

plt.show()

In [None]:
# Logistic regression - multiclass classification on raw pixels.
#
# The earlier draft of this cell could not run: its second half was indented
# (a syntax error), `data`, `target` and `x_test` were never defined,
# `plt.matshow(df)` was applied to a frame with text columns, and it imported a
# Colab-only module. This version builds the inputs described in the original
# comments: one flat vector per picture (data) and the matching main type (target).
from PIL import Image
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

LOGREG_IMAGE_SIZE = (120, 120)  # (width, height) every picture is resized to


def image_to_vector(image_path, size=LOGREG_IMAGE_SIZE):
    """Load an image as a flat grayscale vector with values scaled to [0, 1].

    Parameters
    ----------
    image_path : str
        Path of the image file.
    size : tuple of int
        (width, height) the image is resized to, so that all vectors have the
        same length.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``size[0] * size[1]``.
    """
    with Image.open(image_path) as pil_im:
        gray = pil_im.convert('L').resize(size)
        return np.asarray(gray, dtype=float).ravel() / 255.0


# data: one row per picture; target: the main type of that picture.
data = np.stack([image_to_vector(p) for p in new_df['filepath']])
target = new_df['type'].to_numpy()

data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.3, random_state=1
)

# A separate name keeps the CNN stored in `model` intact. max_iter is raised from
# the default because there is one feature per pixel.
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(data_train, target_train)
print(logreg_model.score(data_test, target_test))  # accuracy on the held-out 30%

# predicting: classify a single external picture, if it is available
sample_path = 'papa.png'
if os.path.exists(sample_path):
    sample_vector = image_to_vector(sample_path)
    plt.imshow(sample_vector.reshape(LOGREG_IMAGE_SIZE[1], LOGREG_IMAGE_SIZE[0]), cmap='gray')
    plt.axis('off')
    plt.show()
    print(logreg_model.predict([sample_vector]))
else:
    print(sample_path, 'not found - skipping the single-image prediction')

In [None]:
# Preview the first rows of df.
df.head()

In [120]:
# Keep only the rows that have a main_type label.
df = df[df['main_type'].notna()]

In [121]:
# Preview the first rows of df.
df.head()