In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
from scipy import stats
import sklearn as sk

In [3]:
batting2015_df = pd.read_csv("batting2015.csv",index_col = 0)
batting2015_df = batting2015_df.convert_objects(convert_numeric=True)

pinfo_df = pd.read_csv("player_info_table.csv",index_col=0)
del pinfo_df["Team"]
pinfo_df = pinfo_df.convert_objects(convert_numeric=True)
pinfo_df.head()

batting2015_df = pd.merge(batting2015_df,pinfo_df,on="選手名")
# batting2015_df.drop("Team_x")
# batting2015_df[merged_df["守備"]=="投手"]
batting2015_df.dtypes

Team       object
背番号         int64
選手名        object
打率        float64
試合        float64
打席数       float64
打数        float64
安打        float64
本塁打       float64
打点        float64
盗塁        float64
四球        float64
死球        float64
三振        float64
犠打        float64
併殺打       float64
長打率       float64
出塁率       float64
OPS       float64
RC27      float64
XR27      float64
No.         int64
守備         object
生年月日       object
年齢          int64
年数          int64
身長          int64
体重          int64
血液型        object
出身地        object
年俸(推定)    float64
投          object
打          object
dtype: object

In [4]:
batting2015_df = batting2015_df.dropna() #NaNを消す
at_bat = batting2015_df["打数"]
hit = batting2015_df["安打"]
Iscarp = batting2015_df["Team"]=="広"
Isbat_right = batting2015_df["打"]=="右"
plt.plot(at_bat[-Iscarp],hit[-Iscarp],"o",label="NotCarp")
plt.plot(at_bat[Iscarp],hit[Iscarp],"o",label="Carp")
plt.plot(np.linspace(0,np.max(at_bat),100),np.linspace(0,np.max(at_bat),100)*.3,"--")
plt.legend(bbox_to_anchor=(.05, 1), loc=2, borderaxespad=0.)
plt.show()

In [5]:
IsoD = batting2015_df["出塁率"]-batting2015_df["打率"]
IsoP = batting2015_df["長打率"]-batting2015_df["打率"]
Isover50 = batting2015_df["打席数"]>=50
plt.subplot(1,3,1)
plt.plot(IsoD[Isover50&Isbat_right],IsoP[Isover50&Isbat_right],"o",label="Bat at R")
plt.plot(IsoD[Isover50&-Isbat_right],IsoP[Isover50&-Isbat_right],"o",label="Bat at L")
# plt.plot(IsoD[-Isover50],IsoP[-Isover50],"o")
plt.legend()
plt.xlabel("IsoD")
plt.ylabel("IsoP")

plt.subplot(1,3,2)
plt.plot(batting2015_df["XR27"][Isover50&Isbat_right],IsoP[Isover50&Isbat_right],"o",label="Bat at R")
plt.plot(batting2015_df["XR27"][Isover50&-Isbat_right],IsoP[Isover50&-Isbat_right],"o",label="Bat at L")
plt.subplot(1,3,3)
plt.plot(batting2015_df["体重"][Isover50&Isbat_right],IsoP[Isover50&Isbat_right],"o",label="Bat at R")
plt.plot(batting2015_df["体重"][Isover50&-Isbat_right],IsoP[Isover50&-Isbat_right],"o",label="Bat at L")

In [6]:
batting2015_df["打"][Isbat_right] = "right"
batting2015_df["打"][-Isbat_right] = "left"

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [None]:
d = {
    "Bat":batting2015_df["打"],
#     "at_bat":batting2015_df["打席数"],
    "hit":batting2015_df["安打"],
    "hr":batting2015_df["本塁打"],
#     "bb":batting2015_df["四球"],
#     "deadball":batting2015_df["死球"], 
    "Price":batting2015_df["年俸(推定)"],
    "IsoD":batting2015_df["出塁率"]-batting2015_df["打率"],
    "IsoP":batting2015_df["長打率"]-batting2015_df["打率"],
    "XR27":batting2015_df["XR27"],
    "Weight": batting2015_df["体重"],
    "Height": batting2015_df["身長"]
}
df = pd.DataFrame(data=d)
sns.pairplot(df,hue="Bat")
# plt.show()
plt.savefig("bat_at_left_vs_right.png")

In [None]:
corr = df.corr()
sns.heatmap(corr)
plt.show()

In [None]:
batting2015_df["IsoP"]=IsoP
batting2015_df["IsoD"]=IsoD
batting2015_df[Isover50].sort(columns="IsoP",ascending=False).head(10)["XR27"]

In [5]:
d = {
    "IsoD":IsoD,
    "IsoP":IsoP,
    "at_bat":at_bat,
    "hit":hit,
    "bb":batting2015_df["四球"],
    "deadball":batting2015_df["死球"], 
    "XR27":batting2015_df["XR27"],
    "hr":batting2015_df["本塁打"]
}
my_df = pd.DataFrame(data=d)
pd.tools.plotting.scatter_matrix(my_df,diagonal="kde")
plt.show()

In [6]:
my_df.plot(kind="scatter",x="IsoD",y="IsoP",s=my_df["at_bat"])
plt.show()

In [7]:
from sklearn.cluster import KMeans
kmeans_model = KMeans(n_clusters = 3,init="k-means++",random_state=10)
kmeans_model.fit(my_df[["IsoD","IsoP"]])

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=10, tol=0.0001,
    verbose=0)

In [8]:
labels = kmeans_model.labels_

In [9]:
for i in range(max(labels)+1):
    plt.plot(my_df["IsoD"][labels==i],my_df["IsoP"][labels==i],"o")
plt.show()

In [None]:
kmeans_model.cluster_centers_

In [None]:
pinfo_df = pd.read_csv("player_info_table.csv",index_col=0)
pinfo_df.head()