In [0]:
import re
import string

import matplotlib.pyplot as plt
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import statsmodels.api as sm
from matplotlib import pyplot as plt
from nltk import FreqDist
from nltk.corpus import stopwords
from pyvis.network import Network
from sklearn.manifold import MDS
from tqdm import tqdm

In [0]:
# Root folder for the notebook's inputs and outputs. Other paths are built by
# plain string concatenation onto this value, so the trailing slash must stay.
path = '/mnt/car_brand_attribute_associations/'

In [0]:
# Read the CSV at Data/Edmunds_Data_New through Spark (first row is the header,
# column types are inferred), then materialise it as a pandas frame.
df_path = f"{path}Data/Edmunds_Data_New"
df = (
    spark.read.format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .load(df_path)
)
edmunds_data = df.toPandas()
edmunds_data.info()

In [0]:
def generate_corpus(column):
    """Flatten an iterable of texts into one lower-cased, punctuation-free string.

    Each entry is converted with ``str()``, every ``string.punctuation``
    character except ``'+'`` is replaced by a space, and the result is
    lower-cased. Entries are joined with a single space so that the last word
    of one entry can never fuse with the first word of the next.

    Args:
        column: Iterable of values (anything ``str()`` accepts).

    Returns:
        A single string that starts with one space, followed by the cleaned
        entries separated by spaces. An empty iterable yields ``" "``.
    """
    # One translation table replaces all punctuation (except '+') in a single pass.
    punct_to_space = str.maketrans(
        {ch: ' ' for ch in string.punctuation if ch != '+'}
    )
    cleaned = [str(text).translate(punct_to_space).lower() for text in column]
    # The leading space matches what this function has always returned.
    return " " + " ".join(cleaned)

In [0]:
# Build the corpus from the comment column and split it into whitespace tokens.
corpus = generate_corpus(edmunds_data['Comment'])
corpus = corpus.split()
print(len(corpus))

# Count every token; tqdm only reports progress while the counter is filled.
word_freq = FreqDist(tqdm(corpus))

# Rank must follow frequency (rank 1 = most frequent word). Iterating a
# FreqDist directly yields words in insertion order, so most_common() is used
# to get (word, count) pairs sorted by descending count.
freq_dist = pd.DataFrame(
    [
        [rank, word, count]
        for rank, (word, count) in enumerate(word_freq.most_common(), start=1)
    ],
    columns=['Rank', 'Word', 'Count'],
)

# Persist the table as CSV (with header), replacing any previous export.
df = spark.createDataFrame(freq_dist)
df.write.option("header", "true").mode("overwrite").csv(path+ 'Support Files/Freq_Dist_new_comments')
freq_dist

In [0]:
# Scaling constant c: the largest rank multiplied by the count observed at that rank.
max_rank = freq_dist['Rank'].max()
# .iloc[0] extracts the scalar explicitly; calling int() on a one-element
# array is deprecated in recent NumPy releases.
c = max_rank * int(freq_dist.loc[freq_dist['Rank'] == max_rank, 'Count'].iloc[0])

l_rank = np.log(freq_dist['Rank'] / c)
l_freq = np.log(freq_dist['Count'])

# Regress log-count on log(rank / c). No constant column is added to the
# design matrix, so the fit is forced through the origin.
model = sm.OLS(l_freq, l_rank)
result = model.fit()
print(result.summary())

In [0]:
# l_rank is the regressor and l_freq the response, so rank goes on the x-axis
# and frequency on the y-axis, matching the axis labels. Both series are logs.
plt.figure(figsize=(20,7))
plt.plot(l_rank, l_freq, label="Raw Zipf's Curve")
plt.plot(l_rank, result.predict(l_rank), label='OLS Fit')
plt.xlabel('log(Rank / c)')
plt.ylabel('log(Frequency)')
plt.title("Zipf's Distribution")
plt.legend()
plt.show()

In [0]:
def process_column(column):
    """Return a new list with every entry of ``column`` normalised as text.

    Each entry is converted with ``str()``, every ``string.punctuation``
    character except ``'+'`` becomes a space, and the text is lower-cased.
    The input is not modified.
    """
    punct_to_space = str.maketrans(
        {mark: ' ' for mark in string.punctuation.replace('+', '')}
    )
    return [str(entry).translate(punct_to_space).lower() for entry in column]

def replace_car_brands(data, map):
    """Replace whole-word model names in each text with their mapped brand.

    Only complete whitespace-delimited tokens are substituted. A plain
    ``str.replace`` would also rewrite the model name wherever it occurs
    *inside* another word (e.g. "rio" inside "superior"), corrupting the text.

    Args:
        data: Mutable sequence of strings. It is updated in place.
        map: Mapping from a token to the string that should replace it.

    Returns:
        The same ``data`` object, after in-place substitution. Whitespace
        between tokens is left exactly as it was.
    """
    def _swap(match):
        token = match.group(0)
        return map.get(token, token)

    for i, text in enumerate(data):
        # \S+ matches each run of non-whitespace, i.e. the tokens str.split() would yield.
        data[i] = re.sub(r'\S+', _swap, text)
    return data

In [0]:
# Normalise the comment text.
comments = process_column(edmunds_data['Comment'])

# Load the Brand/Model substitution table.
df = spark.read.format('csv').options(header=True,inferSchema=True).load(path + 'Support Files/Substitutions/')
brand_model_map = df.toPandas()

# Model -> brand lookup; when a model appears more than once, the last row wins.
model__brand_dict = {
    model_name: brand_name
    for brand_name, model_name in zip(brand_model_map['Brand'], brand_model_map['Model'])
}

comments = replace_car_brands(comments, model__brand_dict)

# Count, for every brand, the number of comments that mention it at least once.
brand_count = dict.fromkeys(brand_model_map['Brand'].unique(), 0)
for comment in comments:
    tokens = set(comment.split())
    for brand in brand_count:
        if brand in tokens:
            brand_count[brand] += 1


In [0]:
# Tabulate the per-brand comment counts, most-mentioned brand first.
# reset_index(drop=False) keeps each row's pre-sort position in an 'index' column.
brand_freq = (
    pd.DataFrame(list(brand_count.items()), columns=['Brand', 'Count'])
    .sort_values('Count', ascending=False)
    .reset_index(drop=False)
)
brand_freq

In [0]:
def lift(texts, x, y, words=float('inf')):
    """Compute the lift between two tokens over a collection of texts.

    ``lift = n * count(x and y) / (count(x) * count(y))`` where ``n`` is the
    number of texts and every count is a number of texts (not of occurrences).
    A text counts towards the joint term when it contains both tokens and the
    closest pair of occurrences has at most ``words`` tokens between them.

    Args:
        texts: Sequence of strings; each is tokenised with ``str.split()``.
        x: First token (exact match against whole tokens).
        y: Second token (exact match against whole tokens).
        words: Maximum number of tokens allowed between ``x`` and ``y`` for a
            co-occurrence to count. Defaults to no limit.

    Returns:
        The lift as a float. Returns ``0.0`` when ``x`` or ``y`` occurs in no
        text at all (including when ``texts`` is empty), because the ratio is
        undefined there and no co-occurrence is possible.
    """
    n = len(texts)
    count_x = 0
    count_y = 0
    count_x_y = 0

    for text in texts:
        tokens = text.split()
        # x and y are matched independently so that x == y is handled too.
        x_indices = [pos for pos, token in enumerate(tokens) if token == x]
        y_indices = [pos for pos, token in enumerate(tokens) if token == y]

        if x_indices:
            count_x += 1
        if y_indices:
            count_y += 1

        if x_indices and y_indices:
            # Number of tokens strictly between the closest x/y pair.
            min_gap = min(
                abs(x_pos - y_pos) - 1
                for x_pos in x_indices
                for y_pos in y_indices
            )
            if float(min_gap) <= float(words):
                count_x_y += 1

    if count_x == 0 or count_y == 0:
        return 0.0
    return (n * count_x_y) / (count_x * count_y)

In [0]:
# The ten most frequently mentioned brands (brand_freq is sorted by Count, descending).
top_10_brands = list(brand_freq[:10]['Brand'])

# Pairwise lift between brands, and its reciprocal as a dissimilarity.
lift_scores = []
dissimilarity_matrix = []
for i, brand_i in enumerate(top_10_brands):
    row = []
    row_diss = []
    for j, brand_j in enumerate(top_10_brands):
        if i == j:
            # Diagonal: a brand has lift 1 and dissimilarity 0 with itself.
            row.append(1)
            row_diss.append(0)
            continue
        lift_score = lift(comments, brand_i, brand_j)
        row.append(lift_score)
        try:
            row_diss.append(1/lift_score)
        except ZeroDivisionError:
            # Zero lift means the brands are treated as infinitely far apart.
            row_diss.append(np.inf)

    # The trailing label becomes the 'Brand' index column below.
    row.append(brand_i)
    row_diss.append(brand_i)
    lift_scores.append(row)
    dissimilarity_matrix.append(row_diss)

# Build the column list without mutating top_10_brands
# (list.append returns None and changes the list in place).
cols = top_10_brands + ['Brand']

lift_scores = pd.DataFrame(lift_scores, columns=cols).set_index('Brand')
display(lift_scores)

dissimilarity_matrix = pd.DataFrame(dissimilarity_matrix, columns=cols).set_index('Brand')
display(dissimilarity_matrix)

In [0]:
def MDS_map(dissimilarity_matrix, metric=True, title='MDS Plot'):
    """Embed a precomputed dissimilarity matrix in 2-D with MDS and plot it.

    Args:
        dissimilarity_matrix: Square table of pairwise dissimilarities whose
            ``.index`` supplies the point labels.
        metric: Forwarded to ``MDS(metric=...)``.
        title: Plot title.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    embedder = MDS(n_components=2, metric=metric, dissimilarity='precomputed', random_state=0)
    coords = embedder.fit_transform(dissimilarity_matrix)
    xs, ys = coords[:, 0], coords[:, 1]

    plt.scatter(xs, ys, color='silver', s=150)
    # Label every embedded point with its row label.
    for label, x_pos, y_pos in zip(dissimilarity_matrix.index, xs, ys):
        plt.annotate(label, (x_pos, y_pos), color='blue')
    plt.title(title)
    plt.axis('off')
    plt.show()


MDS_map(dissimilarity_matrix, metric=True, title='MDS Plot - Car Brands')

In [0]:
# Maximum number of tokens allowed between two terms for a co-occurrence to
# count when at least one of the terms is a feature word.
FEATURE_WINDOW = 40

# Recomputed here so this cell does not depend on how an earlier cell left its lists.
brands = list(brand_freq[:10]['Brand'])
brand_set = set(brands)
features = ['performance', 'luxury', 'driving', 'engine', 'handling', 'interior']
brand_feature = brands + features

# Pairwise lift across brands and features, and its reciprocal as a dissimilarity.
lift_scores = []
dissimilarity_matrix = []
for i, term_i in enumerate(brand_feature):
    row = []
    row_diss = []
    for j, term_j in enumerate(brand_feature):
        if i == j:
            # Diagonal: a term has lift 1 and dissimilarity 0 with itself.
            row.append(1)
            row_diss.append(0)
            continue
        if term_i in brand_set and term_j in brand_set:
            # Brand-to-brand pairs: any co-occurrence in the same comment counts.
            lift_score = lift(comments, term_i, term_j)
        else:
            # Pairs involving a feature must occur within FEATURE_WINDOW tokens.
            lift_score = lift(comments, term_i, term_j, FEATURE_WINDOW)
        row.append(lift_score)
        try:
            row_diss.append(1/lift_score)
        except ZeroDivisionError:
            # Zero lift means the terms are treated as infinitely far apart.
            row_diss.append(np.inf)

    # The trailing label becomes the 'Brand' index column below.
    row.append(term_i)
    row_diss.append(term_i)
    lift_scores.append(row)
    dissimilarity_matrix.append(row_diss)

# Build the column list without mutating brand_feature
# (list.append returns None and changes the list in place).
cols = brand_feature + ['Brand']

lift_scores = pd.DataFrame(lift_scores, columns=cols).set_index('Brand')
display(lift_scores)

dissimilarity_matrix = pd.DataFrame(dissimilarity_matrix, columns=cols).set_index('Brand')
display(dissimilarity_matrix)

## Appendix

In [0]:
def _circle_layout(names):
    """Return {name: [x, y]} with names evenly spaced on a circle of radius len(names)/2."""
    n = len(names)
    r = n / 2
    points = {}
    for i, name in enumerate(names):
        theta = (i * np.pi * 2) / n
        points[name] = [np.cos(theta) * r, np.sin(theta) * r]
    return points


def _strong_edges(lift_matrix, names):
    """Return (first, second, lift) for every unordered pair of names whose lift exceeds 1."""
    edges = []
    for i, first in enumerate(names):
        for second in names[i + 1:]:
            value = lift_matrix[first][second]
            if value > 1:
                edges.append((first, second, value))
    return edges


def netMap(lift_matrix, engine='plotly'):
    """Draw a circular network map of a lift matrix.

    Every column of ``lift_matrix`` becomes a node on a circle. Two nodes are
    joined by a line when their lift is greater than 1, and the line width is
    ``lift / 5``.

    Args:
        lift_matrix: Square DataFrame whose index labels and column labels are
            the same names; read as ``lift_matrix[col][row]``.
        engine: ``'plotly'`` (default) or ``'matplotlib'``.

    Raises:
        ValueError: If ``engine`` is not one of the supported values.

    The figure is shown; nothing is returned.
    """
    if engine not in ('matplotlib', 'plotly'):
        raise ValueError(
            f"Unknown engine {engine!r}; expected 'matplotlib' or 'plotly'"
        )

    names = list(lift_matrix.columns)
    points = _circle_layout(names)
    # Edges come from the matrix passed in, not from a notebook-level global.
    edges = _strong_edges(lift_matrix, names)

    if engine == 'matplotlib':
        # Imported lazily so only the selected plotting backend is required.
        import matplotlib.pyplot as plt

        fig, ax = plt.subplots()
        for name in names:
            point = points[name]
            ax.scatter(point[0], point[1])
            ax.annotate(name, point)

        for first, second, value in edges:
            (x1, y1), (x2, y2) = points[first], points[second]
            ax.plot([x1, x2], [y1, y2], linewidth=value/5, color='b')

        plt.show()

    else:
        # Imported lazily so only the selected plotting backend is required.
        import plotly.graph_objects as go

        fig = go.Figure()
        # Edges first so the node markers are drawn on top of them.
        for first, second, value in edges:
            (x1, y1), (x2, y2) = points[first], points[second]
            fig.add_trace(go.Scatter(
                x=[x1, x2],
                y=[y1, y2],
                mode='lines',
                line=dict(width=value/5, color='blue'),
                hoverinfo='skip',
                showlegend=False,
            ))

        fig.add_trace(go.Scatter(
            x=[points[name][0] for name in names],
            y=[points[name][1] for name in names],
            mode='markers+text',
            text=names,
            textposition='top center',
            showlegend=False,
        ))
        # Hide the axes and lock the aspect ratio so the layout stays circular.
        fig.update_layout(
            xaxis=dict(visible=False),
            yaxis=dict(visible=False, scaleanchor='x'),
        )
        fig.show()

# netMap shows the figure itself and returns None, so there is no result to keep or display.
netMap(lift_scores, engine='matplotlib')