## Let's model some actual sales data
* After the following imports, read in the file __`data/WA_Fn-UseC_-Sales-Win-Loss.csv`__

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw sales win/loss records into a DataFrame.
sales_csv_path = "data/WA_Fn-UseC_-Sales-Win-Loss.csv"
dat = pd.read_csv(sales_csv_path)

## Explore the data a bit

In [None]:
# Peek at the first five rows to see which columns are available.
dat.head(n=5)

## Select/Clean the data

1. We only want rows that have a "Won" value for the "Opportunity Result" column; after filtering, drop that column from the data.
2. We need to convert the categorical data into dummy binary columns. Use the __`get_dummies()`__ function for this purpose.
3. Save off the __`Opportunity Amount USD`__ values for comparison purposes, then drop that column.
4. Split the remaining data into training and test sets, for both the filtered features and the winning amounts (the data saved in step 3).

In [None]:
# Keep only the opportunities that were won. After filtering, the result
# column holds a single value, so it is dropped.
is_won = dat["Opportunity Result"] == "Won"
won = dat.loc[is_won].drop(columns=["Opportunity Result"])

# One-hot encode the categorical (string) columns so the regression can use them.
won = pd.get_dummies(won)

# Target: the deal amount. Features: every remaining column except the target
# and "Opportunity Number" (presumably a record identifier -- confirm against
# the dataset description).
won_output = won["Opportunity Amount USD"]
won_filtered = won.drop(columns=["Opportunity Number", "Opportunity Amount USD"])

# Hold out 1000 rows for testing. The seed is fixed so that re-running the
# notebook yields the same split, and therefore the same coefficients and MAE.
won_train, won_test, won_train_output, won_test_output = train_test_split(
    won_filtered, won_output, test_size=1000, random_state=0
)

In [None]:
# Inspect the first five rows of the filtered, one-hot encoded data.
won.head(n=5)

## Model the data
* Create a __`LinearRegression`__ model and then fit the training data and its output

In [None]:
# Fit a linear regression on the training features and amounts.
regr = LinearRegression()
regr.fit(won_train, won_train_output)
# fit() returns the estimator itself, so both names refer to the fitted model.
linreg = regr

## View results
* Create a __`DataFrame`__ of the coefficients from the regression model
* Sort the values
* Display the values

In [None]:
# One coefficient per feature column, labelled by the column name.
feature_weights = linreg.coef_.tolist()
coef = pd.DataFrame({"coef": feature_weights}, index=won_train.columns)

# Largest coefficients first; left as the final expression so the notebook
# displays the sorted table.
coef.sort_values(by="coef", ascending=False)

## How did we do?
1. Predict the results from the test data
* Measure the predictions against the actual winning results

In [None]:
from sklearn.metrics import mean_absolute_error

# Compare the model's predictions on the held-out rows with the real amounts.
y_actual = won_test_output
y_pred = linreg.predict(won_test)
mae = mean_absolute_error(y_actual, y_pred)

# Print the error next to the mean actual amount to give it a sense of scale.
print("MAE : ${:,.2f}".format(mae))
print("mean: ${:,.2f}".format(np.mean(y_actual)))