# 10 minutes to Data Wrangling

Data wrangling is the process of transforming and structuring data from one raw form into a desired format with the intent of improving data quality and making it more consumable and useful for analytics or machine learning.

In [None]:
import numpy as np

import pandas as pd

## Basic data structures

Pandas provides two types of classes for handling data:

### Series: 
a one-dimensional labeled array holding data of any type
such as integers, strings, Python objects etc.

### DataFrame: 
a two-dimensional data structure that holds data like a two-dimensional array or a table with rows and columns.

# Object creation
Creating a Series by passing a list of values, letting pandas create a default RangeIndex.

In [None]:
# np.nan marks a missing entry; because of it the integers are stored as
# float64. No index is given, so pandas labels the rows 0..5.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

Creating a DataFrame by passing a NumPy array with a datetime index using date_range() and labeled columns:

In [None]:
# Six consecutive calendar days starting 2013-01-01 (daily frequency is
# what date_range uses when only a start and a period count are given).
dates = pd.date_range(start="20130101", periods=6, freq="D")
dates

In [None]:
# 6x4 frame of standard-normal samples: one row per entry of `dates`,
# one column per letter of "ABCD". Values differ on every run (no seed set).
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list("ABCD"),
)
df

Creating a DataFrame by passing a dictionary of objects where the keys are the column labels and the values are the column values

In [None]:
# Scalar entries ("A", "B", "F") are broadcast to the length of the
# array-like columns, so every column ends up with four rows.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)
df2

The columns of the resulting DataFrame have different dtypes:

In [None]:
# One dtype per column, returned as a Series indexed by column name.
df2.dtypes

# Viewing data

In [None]:
df.head()  # first rows of the frame (five when no count is given)

In [None]:
df.tail(3)  # last three rows

In [None]:
df.T  # transpose: rows become columns and columns become rows

In [None]:
df.sort_index()  # sort by index labels (rows, ascending, unless told otherwise); returns a new frame

In [None]:
df.sort_values(by="B")  # sort rows by the values in column "B"; returns a new frame

# Missing data
Reindexing allows you to change/add/delete the index on a specified axis. This returns a copy of the data:

In [None]:
# Keep only the first four dates and add a new column "E", which starts out
# entirely NaN. Then fill "E" for the first two dates (label slices in .loc
# include both endpoints), leaving the other two rows with missing values.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, "E"])
df1.loc[dates[0]:dates[1], "E"] = 1
df1

In [None]:
df1.dropna(how="any")  # drop every row containing at least one missing value; df1 itself is unchanged

In [None]:
df1.fillna(value=5)  # replace every missing value with 5; df1 itself is unchanged

In [None]:
pd.isna(df1)  # boolean mask with the same shape as df1: True where a value is missing

# Join
merge() enables SQL-style join types along specific columns. See the "Database-style joining" section of the pandas user guide.

In [None]:
# Left-hand table for the join; note that the key "foo" appears twice.
left = pd.DataFrame(dict(key=["foo", "foo"], lval=[1, 2]))
left

In [None]:
# Right-hand table for the join; the key "foo" appears twice here as well.
right = pd.DataFrame(dict(key=["foo", "foo"], rval=[4, 5]))
right

In [None]:
# Both frames contain the key "foo" twice, so the join pairs every left row
# with every right row for that key: 2 x 2 = 4 result rows.
left.merge(right, on="key")

In [None]:
# merge() on unique keys: each key occurs once per frame, so every key
# matches exactly one row on the other side and the result has one row per
# key (contrast with the duplicate-key merge, which yields every pairing).
left = pd.DataFrame({"key": ["foo", "bar"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "bar"], "rval": [4, 5]})
pd.merge(left, right, on="key")

More operations at: https://pandas.pydata.org/docs/user_guide/10min.html

# What are some common Pandas operations, in the order they are typically used?

# 1. Load data from a CSV file
This code imports the Pandas library and reads a CSV file called "data.csv." The data sets from the CSV file are loaded into a DataFrame object called df. 

DataFrames are the primary data structure used in Pandas for storing and manipulating data. In a Python session or notebook cell, enter:

import pandas as pd

df = pd.read_csv("data.csv")

# 2. Selecting specific columns
This code selects two specific columns, "column1" and "column2", from the DataFrame df and creates a new DataFrame called selected_columns, containing only those columns.

selected_columns = df[["column1","column2"]]

# 3. Filtering rows based on a condition
This code filters the DataFrame df to include only rows where the value in "column1" is greater than 10. 

The filtered rows are stored in a new DataFrame called filtered_rows.

filtered_rows = df[df["column1"]>10]

# 4. Renaming columns
This code renames a column in the DataFrame df by providing a dictionary with the old column name as the key and the new column name as the value. 

The inplace=True argument tells Pandas to perform the renaming operation directly on the original DataFrame, rather than creating a new DataFrame with the updated column names.

Enter the following command:

df.rename(columns={"old_column_name":"new_column_name"}, inplace=True)

# 5. Grouping data by a specific column
This code groups the DataFrame df by the unique values in the "column1" column and calculates the mean of the other columns for each group. 

The resulting grouped data is stored in a new DataFrame called grouped_data.

grouped_data=df.groupby("column1").mean()

# 6. Merging two DataFrames
This code merges two DataFrames, df1 and df2, based on a common column called "common_column". 

The resulting merged DataFrame is stored in a new DataFrame called merged_data.

merged_data=pd.merge(df1,df2,on="common_column")

# 7. Creating a line plot with Pandas and Matplotlib
This code imports the Matplotlib library and uses the built-in Pandas plotting function to create a line plot. 

The x-axis represents the data in the "column1" column, and the y-axis represents the data in the "column2" column. 

The kind="line" argument specifies that the plot should be a line plot. Finally, plt.show() displays the plot.

Enter the following command:

import matplotlib.pyplot as plt

df.plot(x="column1",y="column2",kind="line")

plt.show()


# IRIS Dataset

https://archive.ics.uci.edu/dataset/53/iris

# ETL

In [None]:
import pandas as pd

# header=None tells pandas the file has no header row: the first line is
# read as data and the columns get integer labels until they are renamed.
iris = pd.read_csv(filepath_or_buffer="iris.data", header=None)

In [None]:
iris.head()  # peek at the first rows; columns still carry integer labels at this point

In [None]:
iris.info()  # prints column dtypes, non-null counts and memory usage

In [None]:
iris.describe()  # summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

In [None]:
# Give the five unlabeled columns descriptive names: four measurements
# followed by the class label.
iris.columns = [
    "SepalLengthCm",
    "SepalWidthCm",
    "PetalLengthCm",
    "PetalWidthCm",
    "Species",
]

In [None]:
iris["Species"].value_counts()  # number of rows for each distinct species label

In [None]:
# Scatter plot with sepal length on the x-axis and sepal width on the y-axis, one point per row.
iris.plot(kind="scatter", x="SepalLengthCm", y="SepalWidthCm")

In [None]:
import seaborn as sns
# Grid of pairwise plots across the numeric columns, with points colored by the "Species" column.
sns.pairplot(iris, hue="Species")

# Evaluating Machine Learning Models to predict the species of an iris flower

### Importing the libraries

In [None]:
# data viz
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from itertools import cycle
import pylab as pl

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics

### load dataset from sklearn

In [None]:
from sklearn.datasets import load_iris
# NOTE: this rebinds `iris`. From here on it is the object returned by
# scikit-learn's load_iris() (exposing .data, .target, .target_names), not
# the pandas DataFrame read from iris.data earlier in the notebook.
iris = load_iris()

In [None]:
# Feature matrix (X) and response vector (y) taken from the loaded dataset
X, y = iris.data, iris.target

### K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Hold out part of the data for testing. random_state pins the shuffle, so
# the split (and therefore the reported score) is reproducible; a different
# value gives a different split and usually a different accuracy.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

# Fit KNN with K=5 on the training portion (fit() returns the estimator itself)
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Classification accuracy on the held-out portion
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Another way of evaluating the performance of our model is K-fold
# cross-validation. It is generally more reliable than a single train/test
# split because the score is averaged over several different splits.
knn = KNeighborsClassifier(n_neighbors=5)

scores = cross_val_score(estimator=knn, X=X, y=y, cv=10, scoring='accuracy')

# Mean accuracy over the 10 folds
print(scores.mean())


In [None]:
# Search for an optimal value of K for KNN: score each K in 1..30 with
# 10-fold cross-validation and keep the mean accuracy per K.
k_range = list(range(1, 31))
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())
print(k_scores)

best_score = max(k_scores)
# k_scores[i] is the score for k_range[i], and k_range starts at 1, so the
# list position has to be mapped back through k_range to get the real K
# (printing the raw position would report K one too small).
# index() returns the first match, i.e. the smallest K reaching the best score.
best_k = k_range[k_scores.index(best_score)]
print("max score:", best_score)
print("K = ", best_k)

In [None]:
# Cross-validated accuracy (y-axis) as a function of K (x-axis)
ax = plt.gca()
ax.plot(k_range, k_scores)
ax.set_xlabel('Value of K for KNN')
ax.set_ylabel('Cross-Validated Accuracy')

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit on the FULL dataset (X, y), not on a train split. This fitted
# estimator is the one predict() uses below. fit() returns the estimator.
logreg = LogisticRegression(max_iter=1000).fit(X, y)

# Out-of-sample estimate: mean accuracy over 20 cross-validation folds
print(cross_val_score(estimator=logreg, X=X, y=y, cv=20, scoring='accuracy').mean())

# In-sample accuracy: the model is scored on the same rows it was fitted
# on, so read this as an optimistic figure rather than a test score.
y_pred = logreg.predict(X)
print(metrics.accuracy_score(y_true=y, y_pred=y_pred))

# Model Pickles

### KNN

In [None]:
import pickle
PickleModelPath = './knn.pkl'

# NOTE(review): `knn` is whichever estimator the earlier cells bound last.
# If the K-search cell ran, that is the n_neighbors=30 instance from its
# final loop iteration - confirm this is the model you intend to save.
knn.fit(X_train, y_train)

# Serialize the fitted classifier; the context manager closes the file.
with open(PickleModelPath, mode='wb') as model_file:
    pickle.dump(knn, model_file)

In [None]:
# Use the public module: `_pickle` is a private implementation detail, and
# `pickle` already uses the C accelerator when it is available.
import pickle

PickleModelPath = './knn.pkl'

# A single sample as a 2-D list (1 row x 4 feature values), which is the
# shape predict() expects.
var1 = [[4.9,3.0,1.4,0.2]]

# SECURITY: unpickling can execute arbitrary code. Only load pickle files
# that you created yourself or that come from a trusted source.
with open(PickleModelPath, 'rb') as model_file:
    PickleModel = pickle.load(model_file)
Answer = PickleModel.predict(var1)

# Predicted class index, then the matching species name
print(Answer)
print(iris.target_names[Answer])

### Logistic Regression

In [None]:
import pickle
PickleModelPath = './logreg.pkl'

# Serialize the already-fitted logistic regression model; the context
# manager closes the file.
with open(PickleModelPath, mode='wb') as model_file:
    pickle.dump(logreg, model_file)

In [None]:
# Use the public module: `_pickle` is a private implementation detail, and
# `pickle` already uses the C accelerator when it is available.
import pickle

PickleModelPath = './logreg.pkl'

# A single sample as a 2-D list (1 row x 4 feature values), which is the
# shape predict() expects.
var1 = [[4.9,3.0,1.4,0.2]]

# SECURITY: unpickling can execute arbitrary code. Only load pickle files
# that you created yourself or that come from a trusted source.
with open(PickleModelPath, 'rb') as model_file:
    PickleModel = pickle.load(model_file)
Answer = PickleModel.predict(var1)

# Predicted class index, then the matching species name
print(Answer)
print(iris.target_names[Answer])