In [393]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.feature_extraction import text
import tldextract
from matplotlib import pyplot as plt

In [394]:
# Show every row when a DataFrame is displayed (None removes the row limit).
pd.options.display.max_rows = None

In [395]:
# Load the exported history, then discard the id, visit-time and count columns.
df = pd.read_csv('history.csv', parse_dates=True).drop(
    columns=['id', 'lastVisitTime', 'lastVisitTimeTimestamp', 'typedCount', 'visitCount']
)

In [396]:
# Extract the domain part of each URL (e.g. "google" for https://www.google.com/...).
domains = df.url.apply(lambda x: tldextract.extract(x).domain)
# Assign rather than concatenate: re-running this cell then overwrites the
# `domains` column instead of appending a second column with the same name,
# which would turn `row.domains` into a Series for every later row-wise step.
df = df.assign(domains=domains)

def f(x):
    """Normalise the title of a single history row.

    Parameters
    ----------
    x : pandas.Series
        One row holding at least a ``title`` and a ``domains`` entry.

    Returns
    -------
    pandas.Series
        A copy of the row. The title is lower-cased, stripped and has every
        occurrence of the domain removed; a title that is exactly the domain
        is kept as is, and a missing title is replaced by the domain.
    """
    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    row = x.copy()

    if pd.isnull(row["title"]):
        # Test for a missing value directly instead of comparing against the
        # string "nan", which the domain removal below could alter.
        row["title"] = row["domains"]
        return row

    title = str(row["title"]).lower().strip()
    if title != row["domains"]:
        # Only strip the domain when something else would remain.
        title = title.replace(row["domains"], "")
    row["title"] = title
    return row

# Run the title clean-up on every row (axis="columns" hands f one row at a time).
df = df.apply(f, axis="columns")

In [397]:
# Define Stop Words: scikit-learn's English list plus "suche" and "search".
my_stop_words = text.ENGLISH_STOP_WORDS.union(["suche", "search"])
# `stop_words` is documented as 'english', a list or None, and recent
# scikit-learn releases validate the type, so pass a list rather than the
# frozenset returned by union(). Sorting keeps the list order deterministic.
cv = CountVectorizer(stop_words=sorted(my_stop_words))


In [398]:
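# Co-occurrence score between tags: inner product of their per-document counts (this equals the number of docs where both occur only when each tag appears at most once per doc)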

def freq_matrix(df):
    """Build the document-term count table for the non-null titles of ``df``.

    Fits the module-level ``cv`` vectorizer on ``df.title`` (null titles are
    skipped) and returns a dense DataFrame with one row per fitted title,
    numbered from 0, and one column per vocabulary term.
    """
    titles = df.title[~pd.isnull(df.title)]
    counts = cv.fit_transform(titles).toarray()
    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); use whichever this version provides.
    get_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
    return pd.DataFrame(counts, columns=[str(name) for name in get_names()])

# Build the count table once. Note that this rebinds the name `freq_matrix`
# from the function defined above to the DataFrame it returns.
freq_matrix = freq_matrix(df=df)

def distance(w1, w2):
    """Return the inner product of the ``freq_matrix`` columns named ``w1`` and ``w2``."""
    first_counts = freq_matrix[w1]
    second_counts = freq_matrix[w2]
    return np.inner(first_counts, second_counts)

# Returns a dataframe (shape = (len(word_array), 1)) containing neighbors of each word in word_array

def get_neighbors(word_array):
    """Collect, for every word, the other words it has a positive ``distance`` to.

    Parameters
    ----------
    word_array : list
        Words to compare pairwise.

    Returns
    -------
    pandas.DataFrame
        Indexed by the words, with a single ``"neighbors"`` column. Each cell
        is a list of one-item dicts ``{word: score}`` ordered by score,
        highest first; the list is empty when no other word scores above 0.
    """
    words = list(word_array)
    # Gather the per-word lists in a 1-D object array and build the frame
    # once: writing a list into a single cell through .loc is easy for pandas
    # to mistake for a multi-value assignment.
    column = np.empty(len(words), dtype=object)
    for position, w in enumerate(words):
        w_neighbors = {}
        for v in words:
            if v == w:
                continue
            score = distance(w, v)  # evaluated once per pair
            if score > 0:
                w_neighbors[v] = score
        ranked = sorted(w_neighbors.items(), key=lambda item: item[1], reverse=True)
        column[position] = [{word: weight} for word, weight in ranked]
    return pd.DataFrame({"neighbors": column}, index=words)

In [399]:
# Get tags
def get_tags(array):
    """Count how often each vocabulary term occurs across ``array``.

    Re-fits the module-level ``cv`` vectorizer on ``array`` (an iterable of
    text documents) and returns a DataFrame with the columns ``"Tag"`` and
    ``"Frequency"``, sorted by frequency, highest first. Terms with equal
    frequency keep the order in which the vectorizer lists them.
    """
    X = cv.fit_transform(array)
    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); use whichever this version provides.
    get_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
    feature_names = [str(name) for name in get_names()]
    totals = np.sum(X.toarray(), axis=0)
    ranked = sorted(zip(feature_names, totals), key=lambda pair: pair[1], reverse=True)
    # Naming the columns in the constructor also works when there are no rows.
    return pd.DataFrame([[tag, total] for tag, total in ranked], columns=["Tag", "Frequency"])

In [413]:
# Filter rows

# Example list of filter terms. The filter_rows calls visible in this notebook
# pass their own lists instead of this variable.
my_filter = ["google"]

def filter_rows(df, my_filter, gate):
    """Return the rows of ``df`` whose title contains the filter terms.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``title`` column.
    my_filter : list
        Terms to look for; matching is a case-insensitive substring test.
    gate : str
        ``"And"`` (in any capitalisation) keeps rows whose title contains
        every term; any other value keeps rows containing at least one term.

    Returns
    -------
    pandas.DataFrame
        The matching rows in their original order. Rows with a null title are
        never returned.
    """
    # Titles are compared in lower case, so lower-case the terms as well;
    # otherwise a term such as "Pandas" could never match.
    terms = [str(term).lower() for term in my_filter]
    # Compare the gate case-insensitively so that "and" does not silently
    # fall through to the OR branch.
    combine = all if str(gate).lower() == "and" else any

    def check(title):
        if pd.isnull(title):
            return False
        lowered = str(title).lower()
        return combine(term in lowered for term in terms)

    # Scan the title column directly instead of DataFrame.apply(axis=1),
    # which builds a Series for every row.
    mask = np.fromiter((check(title) for title in df["title"]), dtype=bool, count=len(df))
    return df[mask]

In [412]:
# Neighbour lists for the first 200 tags returned by get_tags. The [""] filter
# matches every title, so filter_rows only removes rows with a null title.
Neighbors = get_neighbors(
    list(
        get_tags(filter_rows(df, [""], "And")["title"])
        .head(200)["Tag"]
    )
)

In [387]:
# Neighbour list stored for the tag "machine".
print(Neighbors.at["machine", "neighbors"])

[{'learning': 339}, {'titanic': 66}, {'data': 50}, {'jobs': 44}, {'mathematics': 27}, {'opencourseware': 25}, {'home': 23}, {'neural': 21}, {'analysis': 21}, {'science': 19}, {'google': 18}, {'new': 13}, {'chrome': 12}, {'ai': 11}, {'stack': 10}, {'india': 10}, {'video': 10}, {'10': 10}, {'python': 9}, {'university': 9}, {'com': 9}, {'overflow': 8}, {'learn': 7}, {'discussions': 6}, {'master': 6}, {'course': 6}, {'online': 5}, {'regression': 5}, {'web': 4}, {'linear': 4}, {'courses': 3}, {'store': 3}, {'networks': 2}, {'ashoka': 2}, {'algorithms': 2}, {'exchange': 2}, {'best': 2}, {'computer': 2}, {'algebra': 2}, {'logistic': 2}, {'using': 2}, {'stanford': 1}, {'documentation': 1}, {'deeplearning': 1}, {'deep': 1}, {'kshitij': 1}, {'kapoor': 1}, {'test': 1}, {'mail': 1}, {'2020': 1}, {'string': 1}, {'alumni': 1}, {'edu': 1}, {'algorithm': 1}, {'cross': 1}]


In [418]:
# Rows whose title mentions "pandas" or "regression" (the "or" gate keeps a
# row as soon as one term matches).
filter_rows(df, my_filter=["pandas", "regression"], gate="or")

Unnamed: 0,title,url,domains
137,"learn python, data viz, pandas & more | tutori...",https://www.kaggle.com/learn/overview,kaggle
509,pandas.dataframe.sample — pandas 1.0.4 documen...,https://pandas.pydata.org/pandas-docs/stable/r...,pydata
510,.random pandas - -suche,https://www.google.com/search?q=.random+pandas...,google
629,pandas.dataframe.aggregate — pandas 1.0.4 docu...,chrome-extension://klbibkeccnjlkjkiokjodocebaj...,klbibkeccnjlkjkiokjodocebajanakg
635,pandas.dataframe.mode — pandas 1.0.4 documenta...,chrome-extension://klbibkeccnjlkjkiokjodocebaj...,klbibkeccnjlkjkiokjodocebajanakg
651,pandas.dataframe.mode — pandas 1.0.4 documenta...,https://pandas.pydata.org/pandas-docs/stable/r...,pydata
652,pandas.dataframe.aggregate — pandas 1.0.4 docu...,https://pandas.pydata.org/pandas-docs/stable/r...,pydata
653,aggregate pandas - -suche,https://www.google.com/search?q=aggregate+pand...,google
654,.mode pandas - -suche,https://www.google.com/search?q=.mode+pandas&o...,google
655,pandas.concat — pandas 1.0.4 documentation,https://pandas.pydata.org/pandas-docs/stable/r...,pydata
