# TESTING MACHINE LEARNING TOOLS FOR PYTHON

## 1. TEST DATA VISUALIZATION TOOLS - MATPLOTLIB

In [1]:
import matplotlib.pyplot as plt

In [3]:
# Years used as x-values; paired element-wise with the entries of `pop`
year = [
    1950,
    1970,
    1990,
    2012,
]

In [4]:
# Population values used as y-values, one per entry in `year`
# (presumably in billions, given the 'B' tick labels used later -- confirm)
pop = [
    2.519,
    3.692,
    5.263,
    6.972,
]

### 1.1. Plotting data with line and scatter charts

In [5]:
# Plot the data as a line chart: `year` on the x-axis, `pop` on the y-axis
plt.plot(year, pop)
# Render the figure
plt.show()

In [6]:
# Draw the same data as unconnected points: one marker per (year, pop) pair
plt.scatter(x=year, y=pop)
# Render the figure
plt.show()

### 1.2. Plotting data with a histogram

In [7]:
# Plot a histogram of the sample values, grouped into three bins
values = [0, 0.6, 0.4, 2, 3]
plt.hist(x=values, bins=3)

# Render the figure, then clear the current figure
plt.show()
plt.clf()


### 1.3. Customization

In [8]:
# Plot population against year and label both axes
plt.plot(year, pop)
plt.xlabel('Year')
plt.ylabel('Population')

# Add title
plt.title('World Population Projections')

# Add y ticks: exactly one label per tick position.
# The last two labels must be separate strings ('8B', '10B'); a single
# '8B, 10B' string leaves six ticks with only five labels.
plt.yticks([0, 2, 4, 6, 8, 10],
           ['0', '2B', '4B', '6B', '8B', '10B'])

# Render the figure
plt.show()

## 2. TEST MACHINE LEARNING LIBRARY - SCIKIT-LEARN

### 2.1. Data preprocessing with Scikit-learn

#### 2.1.1. Dealing with missing data

In [11]:
# Build a small CSV-formatted sample that contains missing values
import pandas as pd
from io import StringIO

# One record per line; the empty fields (row 2 / column C and
# row 3 / column D) are the missing entries.
csv_data = (
    'A,B,C,D\n'
    '1.0,2.0,3.0,4.0\n'
    '5.0,6.0,,8.0\n'
    '0.0,11.0,12.0,'
)

# StringIO wraps the string in a file-like object, so read_csv can parse
# it exactly as it would parse a regular CSV file on disk
csv_buffer = StringIO(csv_data)
df = pd.read_csv(csv_buffer)

# Display the resulting DataFrame
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


In [7]:
# Count the missing entries in each column
# (isna performs the same check as isnull)
df.isna().sum()


A    0
B    0
C    1
D    1
dtype: int64

#### 2.1.2. Eliminating samples or features with missing values

In [12]:
# Drop every row that contains at least one missing value
# (axis=0 and how='any' are the defaults, spelled out here for clarity)
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [14]:
# Drop every column in which at least one row holds NaN
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,0.0,11.0


In [15]:
# Drop a row only when every one of its columns is NaN
# (no row in this sample qualifies, so all rows are kept)
df.dropna(axis='index', how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


In [16]:
# Drop only the rows whose value in column 'C' is NaN;
# missing values in other columns are left alone
df.dropna(axis='index', subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,0.0,11.0,12.0,
