# TESTING MACHINE LEARNING TOOLS FOR PYTHON

## 1. TEST DATA VISUALIZATION TOOLS - MATPLOTLIB

In [1]:
import matplotlib.pyplot as plt

In [3]:
# Years used as the x-axis values in the plots below
year = [1950, 1970, 1990, 2012]

In [4]:
# Population value for each entry of `year` (same length, same order).
# NOTE(review): presumably billions of people -- the tick labels in the
# customization section use a 'B' suffix; confirm against the data source.
pop = [2.519, 3.692, 5.263, 6.972]

### 1.1. Plotting data with line chart and scatter

In [5]:
# Line chart: population against year, drawn on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.plot(year, pop)
plt.show()

In [6]:
# Scatter plot: one marker per (year, population) pair
fig, ax = plt.subplots()
ax.scatter(year, pop)
plt.show()

### 1.2. Plotting data with histogram

In [7]:
# Plot a histogram of the sample values, grouped into 3 equal-width bins
values = [0, 0.6, 0.4, 2, 3]
plt.hist(values, bins=3)

# Display, then release the figure so nothing carries over into later cells.
# plt.close() is used instead of plt.clf(): when no figure is current any
# more (e.g. after show() has handed it off), clf() first creates a brand-new
# empty figure just to clear it, which then shows up as stray output.
plt.show()
plt.close()


### 1.3. Customization

In [8]:
# Line chart of population against year, with axis labels, a title and
# custom y-axis tick labels.
plt.plot(year, pop)

# Add axis labels
plt.xlabel('Year')
plt.ylabel('Population')

# Add title
plt.title('World Population Projections')

# Add y ticks: one label per tick position. The two lists must have the same
# length, otherwise matplotlib rejects the call (or mislabels the axis).
plt.yticks([0, 2, 4, 6, 8, 10],
           ['0', '2B', '4B', '6B', '8B', '10B'])

plt.show()

## 2. TEST MACHINE LEARNING LIBRARY - SCIKIT-LEARN

### 2.1. Data preprocessing with Scikit-learn

#### 2.1.1. Dealing with missing data

In [12]:
# Build a small CSV sample that contains missing entries
import pandas as pd
from io import StringIO

# One literal per CSV row; the empty fields (row 2 column C, row 3 column D)
# are the missing values.
csv_data = (
    'A,B,C,D\n'
    '1.0,2.0,3.0,4.0\n'
    '5.0,6.0,,8.0\n'
    '0.0,11.0,12.0,'
)

# StringIO wraps the string in a file-like object so read_csv can parse it
# exactly as it would a CSV file on disk
df = pd.read_csv(StringIO(csv_data))

# Show the resulting frame
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


In [14]:
# Count the missing entries in each column (sum the null mask down the rows)
df.isnull().sum(axis=0)


A    0
B    0
C    1
D    1
dtype: int64

#### 2.1.2. Eliminating samples or features with missing values

In [15]:
# Drop every row that contains at least one missing value
df.dropna(axis='index', how='any')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [16]:
# Drop every column that contains at least one missing value
df.dropna(axis='columns', how='any')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,0.0,11.0


In [17]:
# Drop a row only when every one of its values is missing
df.dropna(axis='index', how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


In [18]:
# Keep only the rows whose value in column 'C' is present
df[df['C'].notnull()]

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,0.0,11.0,12.0,


#### 2.1.2. Imputing missing data

In [19]:
# Impute instead of dropping: removing whole samples or features can waste a
# lot of valuable data, so each missing value is estimated from the other
# samples in the data set.
#
# Mean imputation: simply replace each missing value with the mean of its
# feature column.
#   Median imputation: set strategy='median'
#   Most frequent:     set strategy='most_frequent'
#
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer is its replacement. It imputes column-wise and treats NaN as
# the missing-value marker by default.
from sklearn.impute import SimpleImputer
imr = SimpleImputer(strategy='mean')

# fit_transform learns the per-column means and fills the gaps in one step,
# using the same array for both so the fitted input and transformed input match
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[  1. ,   2. ,   3. ,   4. ],
       [  5. ,   6. ,   7.5,   8. ],
       [  0. ,  11. ,  12. ,   6. ]])

#### 2.1.3. Handling categorical data

In [20]:
# Build a small frame mixing nominal, ordinal, numeric and class-label columns
import pandas as pd

df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['colors', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,colors,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


#### 2.1.4. Mapping ordinal features

In [21]:
# Translate the ordinal size labels into integers that keep their order
# (M < L < XL)
size_mapping = dict(XL=3, L=2, M=1)

# Any size not listed in the mapping becomes NaN
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,colors,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


#### 2.1.5. Encoding class labels

In [22]:
# Give every distinct class label an integer code
import numpy as np

# np.unique returns the distinct labels in sorted order; pair each with its position
unique_labels = np.unique(df['classlabel'])
class_maping = dict(zip(unique_labels, range(len(unique_labels))))

# Swap the labels in the column for their integer codes
df['classlabel'] = df['classlabel'].map(class_maping)
df

Unnamed: 0,colors,size,price,classlabel
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [23]:
# scikit-learn's LabelEncoder produces the same integer coding
# NOTE(review): if the manual mapping cell above has already run, 'classlabel'
# holds integers at this point rather than the original strings.
from sklearn.preprocessing import LabelEncoder

class_le = LabelEncoder()
# Encode the column and keep the encoded array in y as well
df['classlabel'] = y = class_le.fit_transform(df['classlabel'].values)
df

Unnamed: 0,colors,size,price,classlabel
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


#### 2.1.6. Partitioning a dataset into training and test sets

In [24]:
# Download the Wine data set from the public UCI repository (needs network access).
# header=None: the file is read without a header row, so the column names
# are supplied here.
df_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
    names=[
        'Class label',
        'Alcohol',
        'Malic acid',
        'Ash',
        'Alcalinity of ash',
        'Magnesium',
        'Total phenols',
        'Flavanoids',
        'Nonflavanoid phenols',
        'Proanthocyanins',
        'Color intensity',
        'Hue',
        'OD280/OD315 of diluted wines',
        'Proline',
    ],
)

# Preview the first rows
df_wine.head()

URLError: <urlopen error [Errno -2] Name or service not known>

In [9]:
# Randomly partition the wine data into training and test sets.
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

# X: NumPy array of every column after the first (the features)
# y: NumPy array of the first column (the class labels)
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

# Hold out 30% of the samples for testing; random_state fixes the shuffle so
# the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

NameError: name 'df_wine' is not defined