In [73]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import Ridge
from sklearn.decomposition import PCA

# Ridge regression & vectorizer

In [2]:
data = pd.read_csv("C:/Users/U_M0SLV/Downloads/salary-train.csv")

In [59]:
def transform_data(dataset):
    for c in dataset.columns[:-1]: 
        dataset[c] = dataset[c].apply(lambda x: str(x).lower())
    dataset['FullDescription'] = dataset['FullDescription'].replace('[^a-zA-Z0-9]', ' ', regex = True)
    dataset['LocationNormalized'] = dataset['LocationNormalized'].replace('[^a-zA-Z0-9]', ' ', regex = True)
    dataset['ContractTime'] = dataset['ContractTime'].apply(lambda x: np.nan if x == 'nan' else x)
    dataset['LocationNormalized'].fillna('nan', inplace=True)
    dataset['ContractTime'].fillna('nan', inplace=True)
    return dataset

In [24]:
data = transform_data(data)

In [66]:
tfidf_vectorizer = TfidfVectorizer(min_df=5)
tfidf = tfidf_vectorizer.fit_transform(data['FullDescription'])

In [68]:
dvect_vectorizer = DictVectorizer()
dvect = dvect_vectorizer.fit_transform(data[['LocationNormalized', 'ContractTime']].to_dict('records'))

In [46]:
arr = hstack([tfidf, dvect])

In [49]:
clf = Ridge(alpha=1.0)

In [51]:
model = clf.fit(arr, data["SalaryNormalized"])

In [52]:
data_test = pd.read_csv("C:/Users/U_M0SLV/Downloads/salary-test-mini.csv")

In [69]:
tfidf_test = tfidf_vectorizer.transform(data_test['FullDescription'])
dvect_test = dvect_vectorizer.transform(data_test[['LocationNormalized', 'ContractTime']].to_dict('records'))
arr_test = hstack([tfidf_test, dvect_test])

In [70]:
prediction = model.predict(arr_test)

In [71]:
prediction

array([ 56563.30950388,  37135.78355465])

# PCA

In [76]:
data = pd.read_csv("C:/Users/U_M0SLV/Downloads/close_prices.csv")
djia = pd.read_csv("C:/Users/U_M0SLV/Downloads/djia_index.csv")

In [82]:
result = pd.concat([data, djia], axis=1, join='inner')

In [93]:
pca = PCA(n_components=10)
model = pca.fit(data[[col for col in data.columns if col <> 'date']])

In [98]:
print model.explained_variance_ratio_[:3].sum(), model.explained_variance_ratio_[:4].sum()

0.898993755584 0.927742953784


In [99]:
red = model.transform(data[[col for col in data.columns if col <> 'date']])

In [108]:
first_component = [a[0] for a in red]

In [115]:
print "Correlation between Y and the first component: {0:.2f}".format(np.corrcoef(djia['^DJI'], first_component)[1][0])

Correlation between Y and the first component: 0.91


In [130]:
#the maximum-variance component
ind=np.argwhere(model.explained_variance_ratio_ == model.explained_variance_ratio_.max())[0][0]
#the feature having max value for the maximum-variance component
ind =  np.argwhere(model.components_[ind] == model.components_[ind].max())[0][0]

In [145]:
print "The company having the maximum weight in the maximum component: {0}".format(data.columns[ind+1][0][0])

The company having the maximum weight in the maximum component: V
