### Import the modules needed

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import calendar
from scipy.optimize import curve_fit
from sklearn.cluster import KMeans
import itertools
from IPython.display import clear_output


### Read the CSV data and add new columns containing the day of the week and the month of each row


In [2]:
# data from the link: https://thedataincubator.us8.list-manage.com/subscribe/confirm?u=70e04e2160786cdebf3df2567&id=fbf1336bda&e=b835ffc04e
# download it and assign file the path to where you stored it.
# NOTE(review): this is an absolute, machine-specific path; point it at your own copy of the CSV.
file = "C:/Users/rober/Python_notebook/Dataincubator/LinkedIn company likes/temp_datalab_records_linkedin_company.csv"
df = pd.read_csv(file,low_memory=False)
df["as_of_date"] = pd.to_datetime(df["as_of_date"],format="%Y-%m-%d")
# Vectorised weekday name. dt.day_name() returns English names by default, which is what
# the day list in get_features expects; calendar.day_name follows the system locale and
# the per-row lambda fails on missing dates.
df['day_of_week'] = df['as_of_date'].dt.day_name()
df['month'] = df.as_of_date.dt.month



### Define some functions we shall use.

In [3]:
# Functions
def get_change_in_likes(df):
    """
    Compute day-over-day changes in LinkedIn follower and employee counts.

    Every row is compared with the row directly before it, in the order the rows
    appear in ``df``. A row is kept only when its date is exactly one day apart
    from the previous row's date, so gaps in the time series never produce
    multi-day differences. The first row has no predecessor and is always dropped.

    Inputs:
        df: a dataframe with (at least) the columns 'as_of_date' (datetime),
            'followers_count' and 'employees_on_platform'.
            NOTE(review): rows are not sorted here; presumably the caller passes
            the rows of a single company in chronological order - confirm.
    Outputs:
        dataframe: a new dataframe (the input is not modified) holding the kept
                   rows with the input columns plus:
                     'ordinal_date'   - proleptic Gregorian ordinal of 'as_of_date'
                     'day_difference' - ordinal_date minus the previous row's (+1 or -1)
                     'like_difference' - followers_count minus the previous row's
                     'employees_on_platform_difference' - absolute change in
                                                          employees_on_platform
                   An empty input is returned unchanged, without the extra columns.

    """
    if df.empty:
        return(df)

    # Work on a copy: the caller usually passes a filtered slice, and writing new
    # columns into it would mutate the caller's data / trigger SettingWithCopyWarning.
    result = df.copy()

    # converts dates to ordinal for easy computation
    result['ordinal_date'] = result['as_of_date'].apply(lambda x: x.toordinal())

    # diff() is positional, so it stays correct even if index labels repeat.
    day_step = result['ordinal_date'].diff()
    like_step = result['followers_count'].diff()
    employee_step = result['employees_on_platform'].diff().abs()

    # Two neighbouring rows are consecutive when their dates are exactly one day apart.
    # A plain boolean array avoids any index alignment in where().
    consecutive = (day_step.abs() == 1).to_numpy()

    result["day_difference"] = day_step.where(consecutive)
    result["like_difference"] = like_step.where(consecutive)
    result["employees_on_platform_difference"] = employee_step.where(consecutive)

    # Keep only rows for which all three differences could be computed.
    keep = (
        result["day_difference"].notnull()
        & result["like_difference"].notnull()
        & result["employees_on_platform_difference"].notnull()
    ).to_numpy()
    return(result[keep])

def get_features(df, min_weekly_new_likes=7):
    """
    Extract the features needed for machine learning: for every company, the average
    number of new LinkedIn likes on each day of the week.

    Input:
            df(dataframe): a dataframe containing the columns 'industry', 'company_name',
            'followers_count', 'day_of_week' and whatever get_change_in_likes needs
            ('as_of_date', 'employees_on_platform').
            min_weekly_new_likes(number): a company is kept only when the sum of its
            seven daily averages is strictly greater than this value (default 7,
            i.e. more than one new like per day on average).

    Output:
            dataframe(python dataframe): one row per retained company with the columns
            'Industry', 'Company' and 'Monday' ... 'Sunday' (average new likes per weekday).
            Companies that lack data for any of the seven weekdays are skipped.

    NOTE(review): industries are grouped by their raw string, so HTML-escaped variants
    such as 'Apparel &amp; Fashion' form a separate group from 'Apparel & Fashion'.
    """
    days = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
    columns = ["Industry","Company"] + days

    # Collect plain rows and build the frame once at the end; appending to a
    # DataFrame with .loc inside the loop is quadratic in the number of companies.
    records = []
    for industry, industry_df in df.groupby("industry"):
        print("At",industry)
        for company, company_df in industry_df.groupby("company_name"):
            company_df = company_df[company_df['followers_count']>=1]
            changes = get_change_in_likes(company_df)
            # No consecutive-day pairs (or no rows at all): nothing to average.
            if changes is None or changes.empty:
                continue
            # Mean new likes per weekday, laid out Monday..Sunday; absent days become NaN.
            daily_means = changes.groupby("day_of_week")["like_difference"].mean().reindex(days)
            if daily_means.isna().any():
                continue
            weekly_avg = daily_means.tolist()
            if sum(weekly_avg) > min_weekly_new_likes:
                records.append([industry, company] + weekly_avg)
    return(pd.DataFrame(records, columns=columns))

def get_kmean_clusters(df,n,random_state=0):
    """
    Uses k-means clustering to group the companies according to the averaged number
    of new likes on each day of the week.

    Inputs:
            df(dataframe): A dataframe containing the columns 'Industry' and 'Company'
            plus the numeric feature columns (average new likes per weekday).
            n(integer): number of clusters to use.
            random_state(int or None): seed passed to KMeans so that repeated runs give
            the same labels; pass None to get a different random initialisation each run.
    Output:
            cluster_label(array): array of integers representing the cluster each row
            belongs to, in the row order of df.

    """
    # Only the numeric weekday averages are used as features.
    features = df.drop(columns=["Industry","Company"])
    km = KMeans(n_clusters=n, init='k-means++', n_init=10, random_state=random_state)
    cluster_label = km.fit(features).labels_
    return(cluster_label)

def classify_companies(df,clusters):
    """
    Classify the companies into three groups: well known, averagely known and little
    known companies.

    Each cluster is ranked by its marker, the largest weekday average found among its
    companies. The lowest-ranked cluster is "little known", the next two are
    "averagely known" and the following three are "well known". Any further cluster
    is reported with a printed error and left out of the result.

    Inputs:
            df(dataframe): A dataframe with the columns 'Company', 'Industry', 'Cluster'
            and the average number of new LinkedIn likes for each day of the week.
            clusters(array): An array of integers representing the cluster each row belongs to.

    Outputs:
             well_known_companies(list): one list of company names per well-known cluster.
             averagely_known_companies(list): one list of company names per averagely-known cluster.
             little_known_companies(list): one list of company names per little-known cluster.
    """
    company_clusters = df.groupby("Cluster")

    # (marker, companies) pairs. A list is used instead of a dict keyed by the marker
    # so that two clusters sharing the same marker are both kept.
    ranked_groups = []
    for label in sorted(set(clusters)):
        a_group = company_clusters.get_group(label)
        # The cluster label itself must not take part in the maximum, otherwise a
        # low-activity cluster with a high label would be ranked too high.
        feature_values = a_group.drop(["Company","Industry","Cluster"],axis = 1).values
        group_marker = feature_values.max()
        ranked_groups.append((group_marker, list(a_group.Company.values)))

    # Stable sort: clusters with equal markers keep their label order.
    ranked_groups.sort(key=lambda pair: pair[0])

    little_known_companies = []
    averagely_known_companies = []
    well_known_companies = []
    for rank, (_, companies) in enumerate(ranked_groups):
        if rank == 0:
            little_known_companies.append(companies)
        elif rank <= 2:
            averagely_known_companies.append(companies)
        elif rank <= 5:
            well_known_companies.append(companies)
        else:
            print("Error! Number of clusters is more than 6")
    return(well_known_companies,averagely_known_companies,little_known_companies)

In [None]:
pd.options.mode.chained_assignment = None  # default='warn'
dataframe = get_features(df)
industry_groups = dataframe.groupby("Industry")
# Sorted so the industries are always reported in the same order.
industry_names = sorted(set(dataframe.Industry.values))
n = 6 #number of clustering centers
print("Well-known companies in each industry are:\n\n")
for industry in industry_names:
    # Use a dedicated name: rebinding the global `df` here would replace the raw
    # data with one industry's feature rows and make this cell fail when re-run.
    industry_df = industry_groups.get_group(industry)
    # k-means needs at least as many companies as cluster centres.
    if len(industry_df) < n:
        continue
    clusters = get_kmean_clusters(industry_df,n)
    # assign() returns a new frame instead of writing into the group slice.
    industry_df = industry_df.assign(Cluster=clusters).sort_values(['Cluster'])
    three_group_classification = classify_companies(industry_df,clusters)

    print(industry,":",tuple(itertools.chain(*three_group_classification[0])),"\n")

At Accounting
At Airlines/Aviation
At Apparel & Fashion
At Apparel &amp; Fashion
At Automotive
At Aviation & Aerospace
At Aviation &amp; Aerospace
At Banking
At Biotechnology
At Broadcast Media
At Building Materials
At Business Supplies and Equipment
At Capital Markets
At Chemicals
At Civil Engineering
At Commercial Real Estate
At Computer & Network Security
At Computer &amp; Network Security
At Computer Games
At Computer Hardware
At Computer Networking
At Computer Software
