In [1]:
import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns
import numpy as np


import re  
import nltk
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk import FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

In [2]:
df = pd.read_csv('../notebooks/nlp_iphones_cleaned.csv')

In [3]:
df = df.drop(columns='Unnamed: 0')

In [4]:
df

Unnamed: 0,title,price,condition,condition_description,num_reviews,pos_feedback_pct,return_days,model,capacity,release_year
0,apple iphone 6s (excellent condition) factory ...,135.99,1.0,“fully functional in excellent condition. our ...,30474.0,99.9,2.0,8,57.492355,2015
1,apple iphone 8 64gb gsm factory unlocked smart...,149.99,1.0,“overall good condition will have marks and sc...,13110.0,96.2,2.0,11,64.000000,2017
2,apple iphone 8 plus 256gb unlocked straight ta...,259.00,4.0,““pick your carrier or pick fully unlocked to ...,61033.0,100.0,2.0,11,256.000000,2017
3,apple iphone 7 32gb gsm factory unlocked 4g lt...,107.99,1.0,“overall good working condition and will have ...,13110.0,96.2,2.0,10,32.000000,2016
4,apple iphone x 64gb factory unlocked phone - v...,209.95,3.0,"“this is a b+ stock item, meaning unit is in v...",332402.0,98.4,3.0,12,64.000000,2017
...,...,...,...,...,...,...,...,...,...,...
4141,apple iphone 11 pro 64gb silver verizon unlock...,456.99,3.0,“device is 100% fully functional and in very g...,54837.0,99.4,3.0,15,64.000000,2019
4142,iphone 11 xfinity 128gb white | near mint,335.00,1.0,“near mint condition: screen is 100% flawless....,22050.0,99.6,3.0,15,128.000000,2019
4143,apple iphone 11 pro max 512gb black - unlocked...,591.00,1.0,“this is a used device with real photos of the...,80257.0,99.7,2.0,15,512.000000,2019
4144,apple iphone 7 - 128gb - black - factory unloc...,149.99,1.0,“this device powers on and is 100% functional ...,4258.0,99.8,2.0,10,128.000000,2016


In [5]:
def process_description(description, min_length):
    stop_words = stopwords.words('english')

    wnl = WordNetLemmatizer()

    def pos_tagger(nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:         
            return None
        
    description_lower = description.lower()
    description_lower = re.sub(r"@[a-z0-9_]+|#[a-z0-9_]+|http\S+", "", description_lower).strip().replace("\r", "").replace("\n", "").replace("\t", "")

    description_norm = [x for x in word_tokenize(description_lower) if ((x.isalpha()) & (x not in stop_words)) ]

    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(description_norm))) 

    if len(wordnet_tagged) <= min_length:
        return ''
    else:
        # rejoins lemmatized sentence 
        description_norm = " ".join([wnl.lemmatize(x[0], x[1]) for x in wordnet_tagged if x[1] is not None])
        return description_norm

In [None]:
df['condition_description'] = df['condition_description'].apply(process_description, args=[5])
df['title'] = df['title'].apply(process_description, args=[5])

In [None]:
df

In [None]:
sia = SentimentIntensityAnalyzer()

In [None]:
# no data leakage. using pretrained model.
sia.polarity_scores(df['condition_description'][0])

In [None]:
polarity_list = []
for row in df['condition_description']:
    polarity_list.append(sia.polarity_scores(row))

condition_polarity = pd.DataFrame(polarity_list)
condition_polarity.rename(columns={'neg':'condition_neg', 'neu':'condition_neu', 'pos':'condition_pos', 'compound':'condition_compound'}, inplace=True)

In [None]:
condition_polarity

In [None]:
polarity_list = []
for row in df['title']:
    polarity_list.append(sia.polarity_scores(row))

title_polarity = pd.DataFrame(polarity_list)
title_polarity.rename(columns={'neg':'title_neg', 'neu':'title_neu', 'pos':'title_pos', 'compound':'title_compound'}, inplace=True)

In [None]:
polarity_df = pd.merge(condition_polarity, title_polarity, left_index=True, right_index=True)

In [None]:
df = pd.merge(df, polarity_df, left_index=True, right_index=True)

In [None]:
sns.heatmap(df.corr())

In [None]:
sns.scatterplot(y='price', x='condition_compound', hue='condition', data=df)
plt.show()

In [None]:
# sns.pairplot(hue='model', data=df)

In [None]:
plt.hist(df['price'])

In [None]:
df = df[df['price']>40]

In [None]:
plt.hist(df['price'])

In [None]:
df.columns

In [None]:
X = df.drop(columns = ['price', 'title'])
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=305)

In [None]:
nlp_cols = 'condition_description'
num_cols = ['num_reviews', 'pos_feedback_pct', 'capacity', 'condition_neg', 'condition_neu', 'condition_pos', 'condition_compound',
       'title_neg', 'title_neu', 'title_pos', 'title_compound']
cat_cols = ['condition', 'return_days', 'model', 'capacity', 'release_year']

In [None]:
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,3))

In [None]:
tfidf.fit_transform(X_train['condition_description']).toarray().shape


In [None]:
tfidf.get_feature_names()

In [None]:
numeric_pipeline = Pipeline([('numnorm', StandardScaler())])

categoric_pipeline = Pipeline([('ordnorm', StandardScaler())])

nlp_pipeline = Pipeline([('nlpvect', TfidfVectorizer(stop_words = 'english', ngram_range=(1,3)))])

In [None]:
ct = ColumnTransformer(
    [ ("nlppipeline", nlp_pipeline, nlp_cols),
      ("numericpipe", numeric_pipeline, num_cols),
      ("categoricpipeline", categoric_pipeline, cat_cols)
    ]
)

In [None]:
ct

In [None]:
params = {
    "n_estimators": 500,
    "max_depth": 6,
    "min_samples_split": 5,
    "learning_rate": 0.02,
    "loss": 'lad'
}

xgboostreg = GradientBoostingRegressor(**params)

In [None]:
final_pipe = Pipeline([('preprocess', ct),
                      ('model', xgboostreg)])
final_pipe

In [None]:
final_pipe.fit(X_train, y_train)

In [398]:
y_pred = final_pipe.predict(X_test)

In [399]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r_squared = r2_score(y_test, y_pred)

print('RMSE: '+str(rmse))
print('R-Squared: '+str(r_squared))

RMSE: 54.05906276734193
R-Squared: 0.9162356933036253
