In [None]:
import json
import os
import sys
import fnmatch
import scipy.stats as stats

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import ceil
import numpy as np
from scipy import linalg

def lowess(x, y, f=2. / 3., iter=3):
    """Robust locally weighted regression (LOWESS) smoother.

    For every point ``x[i]`` a straight line is fitted by weighted least
    squares, using tricube weights that fall to zero at the distance of the
    r-th nearest neighbour (``r = ceil(f * n)``).  The fit is then repeated
    ``iter`` times, each time down-weighting points with large residuals
    through bisquare weights.

    Parameters
    ----------
    x, y : array_like
        One-dimensional sequences of equal length; ``(x[i], y[i])`` is one
        data point.  They are converted to float arrays, so lists and
        pandas Series are accepted as well as ndarrays.
    f : float
        Smoothing span as a fraction of the number of points.  Larger
        values give a smoother curve.  Values that would select a
        neighbour beyond the last point (``f >= 1``) are clamped to the
        farthest point.
    iter : int
        Number of fitting passes (the name shadows the builtin but is kept
        for backward compatibility with keyword callers).

    Returns
    -------
    numpy.ndarray
        The smoothed estimate of ``y`` at every ``x[i]``.

    Notes
    -----
    If more than ``r`` points share the same ``x`` value the local
    bandwidth is zero and the weights for that point are undefined.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    n = len(x)
    if n == 0:
        return np.zeros(0)

    # Index of the neighbour that defines the local bandwidth; clamp it so
    # that f >= 1 does not index past the end of the sorted distances.
    r = min(int(ceil(f * n)), n - 1)

    # dist[:, i] holds |x - x[i]|; h[i] is its r-th smallest entry.
    dist = np.abs(x[:, None] - x[None, :])
    h = np.sort(dist, axis=0)[r]

    # Tricube kernel: column i contains the weights used for the fit at x[i].
    w = np.clip(dist / h, 0.0, 1.0)
    w = (1 - w ** 3) ** 3

    yest = np.zeros(n)
    delta = np.ones(n)
    for _ in range(iter):
        for i in range(n):
            weights = delta * w[:, i]
            wx = weights * x
            # Normal equations of the weighted straight-line fit.
            b = np.array([np.sum(weights * y), np.sum(wx * y)])
            A = np.array([[np.sum(weights), np.sum(wx)],
                          [np.sum(wx), np.sum(wx * x)]])
            beta = linalg.solve(A, b)
            yest[i] = beta[0] + beta[1] * x[i]

        residuals = y - yest
        s = np.median(np.abs(residuals))
        if s == 0:
            # At least half of the residuals are exactly zero.  Dividing by
            # s would turn the robustness weights into NaN, so keep the
            # current fit instead of iterating further.
            break
        # Bisquare robustness weights for the next pass.
        delta = np.clip(residuals / (6.0 * s), -1, 1)
        delta = (1 - delta ** 2) ** 2

    return yest
from scipy import ndimage as nd

def fill(data, invalid=None):
    """
    Overwrite every invalid cell of 'data' with the value of the closest
    valid cell (Euclidean distance measured in index space).

    Input:
        data:    numpy array of any dimension
        invalid: boolean array with the same shape as 'data'; True marks the
                 cells whose value should be replaced.
                 When None (default) the mask is np.isnan(data).

    Output:
        A new array of the same shape as 'data' with the marked cells filled.
    """
    mask = np.isnan(data) if invalid is None else invalid

    # For each masked cell the transform yields the index of the nearest
    # unmasked cell; unmasked cells map to themselves.
    nearest = nd.distance_transform_edt(
        mask,
        return_distances=False,
        return_indices=True,
    )
    return data[tuple(nearest)]

In [None]:
# Load the statistics table used by every later cell (the cells below read
# its Salary, Ovr, Pot and Age columns).
df = pd.read_csv('avg_stats.csv')
# Bare expression: shown as the cell output, i.e. (number of rows, number of columns).
df.shape

In [None]:
# Small hexagonal-bin density plot of the Salary column (x) against the
# Ovr column (y); bin counts are coloured on a logarithmic scale.
plt.figure(figsize=(3, 3))
plt.hexbin(df['Salary'], df['Ovr'], gridsize=15, bins='log')
plt.xlabel('salary')
plt.ylabel('overall')
plt.tight_layout()
plt.savefig('s-o.png')

In [None]:
# Small hexagonal-bin density plot of the Salary column (x) against the
# Pot column (y); bin counts are coloured on a logarithmic scale.
plt.figure(figsize=(3, 3))
plt.hexbin(df['Salary'], df['Pot'], gridsize=15, bins='log')
plt.xlabel('salary')
plt.ylabel('potential')
plt.tight_layout()
plt.savefig('s-p.png')

In [None]:
from sklearn import preprocessing,linear_model

In [None]:
# Expand the Ovr column into polynomial terms up to degree 5 and fit a
# ridge regression (default settings) that predicts Salary from them.
fexp = preprocessing.PolynomialFeatures(degree=5)
Xt = fexp.fit_transform(df['Ovr'].to_numpy().reshape(-1, 1))
clf = linear_model.Ridge()
clf.fit(Xt, df['Salary'])


In [None]:
# Evaluate the fitted model on 50 evenly spaced values from 0 to 100 and
# draw the resulting curve, clipping the vertical axis to 0..50.
x = np.linspace(0, 100, num=50)
y = clf.predict(fexp.transform(x[:, np.newaxis]))
plt.plot(x, y)
plt.ylim(0, 50)

In [None]:
# Mean Salary on a 100 x 100 grid of (Ovr, Pot) bins.  Bins that contain no
# rows come back as NaN.
x = df['Ovr']
y = df['Pot']
data = df['Salary']
NX = NY = 10
statistic, xedges, yedges, binnumber = stats.binned_statistic_2d(
    x, y, values=data, bins=100, statistic='mean')
print(statistic)

In [None]:
# Smoothed image of the binned mean salary.
#
# binned_statistic_2d returns an array indexed [x_bin, y_bin], i.e.
# [Ovr bin, Pot bin], whereas imshow (and contour) draw the FIRST axis
# vertically.  Transpose before filling so that rows are Pot bins (y axis)
# and columns are Ovr bins (x axis), which is what the axis labels say.
IM = fill(statistic.T)

# Cells on or above the main diagonal: Pot bin index <= Ovr bin index.
# NOTE(review): bin indices are positions inside each column's own data
# range (bins=100 with no explicit range), not rating values - confirm the
# diagonal is the intended cut-off.
ind = np.triu_indices(IM.shape[0], m=IM.shape[1])
IM[ind] = 0
#IM = nd.median_filter(IM,3)
IM = nd.gaussian_filter(IM,2)

# The blur bleeds values back into the masked triangle; clear it again.
IM[ind] = 0
plt.xlim(0,95)
plt.ylim(0,95)
plt.imshow(IM,origin='lower')
plt.colorbar()
plt.xlabel('overall')
plt.ylabel('potential')
plt.title('Mean Salary for BBGM Rating')
plt.tight_layout()
plt.savefig('mean-color.png')

In [None]:
# Contour plot of the smoothed mean-salary image IM with inline labels.
# Reset to the default style first so that only 'fivethirtyeight' applies.
plt.style.use('default')
plt.style.use('fivethirtyeight')
plt.figure(figsize=(6,6))


class nf(float):
    """Float whose repr shows one decimal, or none when that decimal is 0."""

    def __repr__(self):
        s = f'{self:.1f}'
        return f'{self:.0f}' if s[-1] == '0' else s


CS = plt.contour(IM,levels=10)
# Recast levels to new class
CS.levels = [nf(val) for val in CS.levels]

# Label levels with the compact repr defined above.  The values are mean
# salaries, so no percent sign is appended in either text-rendering mode.
fmt = '%r'

plt.clabel(CS, CS.levels, inline=True, fmt=fmt, fontsize=16)
plt.grid(True)
plt.xlim(0,95)
plt.ylim(0,95)
plt.title('Mean Salary for BBGM Rating')
plt.xlabel('overall')
plt.ylabel('potential')
plt.tight_layout()

plt.savefig('mean-contour.png')

In [None]:
# Jittered scatter of Ovr against Salary for a random subset of rows,
# coloured by Age (colour scale fixed to 22..36).
plt.figure(figsize=(6,6))
# Cap the sample at the table size: DataFrame.sample raises ValueError when
# asked for more rows than exist (sampling without replacement).
df2 = df.sample(min(5000, len(df)))
plt.title('Overall v Salary')
# Gaussian jitter (sigma 0.1) on both axes so coincident points do not
# hide each other.
plt.scatter(
    df2.Ovr + 0.1 * np.random.randn(len(df2)),
    df2.Salary + 0.1 * np.random.randn(len(df2)),
    vmin=22, vmax=36, c=df2.Age, s=20, alpha=0.7,
)
cbar = plt.colorbar()
cbar.ax.set_ylabel('Age', rotation=0)
plt.xlabel('overall')
plt.ylabel('salary')
plt.tight_layout()
# NOTE(review): the salary/overall hexbin figure earlier in this file is
# saved under the same name, so this overwrites it - confirm intended.
plt.savefig('s-o.png')

In [None]:
# Jittered scatter of Pot against Salary for a random subset of rows,
# coloured by Age (colour scale fixed to 22..36).
plt.figure(figsize=(6,6))
# Cap the sample at the table size: DataFrame.sample raises ValueError when
# asked for more rows than exist (sampling without replacement).
df2 = df.sample(min(5000, len(df)))
plt.title('Potential v Salary')
# Gaussian jitter (sigma 0.1) on both axes so coincident points do not
# hide each other.
plt.scatter(
    df2.Pot + 0.1 * np.random.randn(len(df2)),
    df2.Salary + 0.1 * np.random.randn(len(df2)),
    vmin=22, vmax=36, c=df2.Age, s=20, alpha=0.7,
)
cbar = plt.colorbar()
cbar.ax.set_ylabel('Age', rotation=0)
plt.xlabel('potential')
plt.ylabel('salary')
plt.tight_layout()
# NOTE(review): named 's-o2.png' although it plots potential, unlike the
# 's-p.png' hexbin figure - confirm the file name is intended.
plt.savefig('s-o2.png')