In [30]:
import pandas as pd
import numpy as np


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split


In [33]:

# Load only the first two spreadsheet columns of the training workbook.
df = pd.read_excel("train_data.xlsx", usecols=[0, 1])

# Distinct "Domains+Events" labels, listed in order of first appearance.
target_name = df["Domains+Events"].drop_duplicates().tolist()

In [34]:
# Integer code for each label, numbered in order of first appearance.
df["Id"] = pd.factorize(df["Domains+Events"])[0]

In [35]:
# One row per distinct label together with its integer code, ordered by code.
category_id_df = (
    df.loc[:, ["Domains+Events", "Id"]]
    .drop_duplicates()
    .sort_values(by="Id")
)

# Lookup tables in both directions, built from the same (label, code) rows.
category_to_id = {label: code for label, code in category_id_df.values}
id_to_category = {code: label for label, code in category_id_df.values}

In [36]:
# scikit-learn's English stop-word set with "AI"/"ai" removed if present,
# so those tokens are never filtered out.
stopword = text.ENGLISH_STOP_WORDS.difference(["AI", "ai"])

# TF-IDF over unigrams and bigrams that occur in at least two documents.
# The token pattern keeps the literal "c++" as a single token and otherwise
# matches runs of word characters (including one-character words).
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=2,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    # Newer scikit-learn releases validate `stop_words` and accept only
    # 'english', a list, or None; a frozenset is rejected, so pass a list.
    stop_words=sorted(stopword),
    token_pattern=r"(?u)c\+{2}|\b\w+\b",
)

# Sparse document-term matrix for the event names, and the string labels
# that the classifier is trained on.
features = tfidf.fit_transform(df['Event Names'].values)
labels = df['Domains+Events']


In [37]:

# The vocabulary (in feature-column order) is the same for every category,
# so fetch it once. `get_feature_names` no longer exists in newer
# scikit-learn releases; use its replacement when it is available.
if hasattr(tfidf, "get_feature_names_out"):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.asarray(tfidf.get_feature_names())

for product, category_id in sorted(category_to_id.items()):
    # `labels` holds the label strings, so the one-vs-rest target must be
    # built from the label itself (`product`). Comparing against the integer
    # `category_id` is False for every row and makes the chi2 scores useless.
    features_chi2 = chi2(features, labels == product)
    # Ascending chi2 score: the most correlated terms end up last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    uni = [v for v in feature_names if len(v.split(' ')) == 1]
    bi = [v for v in feature_names if len(v.split(' ')) == 2]
    #print("# '{}':".format(product))
    #print("  . Most correlated unigrams:\n. {}".format('\n. '.join(uni[-2:])))
    #print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bi[-2:])))

In [38]:
from sklearn.svm import LinearSVC
# Untrained linear support-vector classifier.
svc = LinearSVC()

# Hold out 1% of the rows. The row index is split alongside the features and
# labels so each side can be mapped back to rows of `df`.
(
    xtrain, xtest,
    ytrain, ytest,
    indices_train, indices_test,
) = train_test_split(
    features,
    labels,
    df.index,
    test_size=0.01,
    random_state=0,
)

In [39]:
# `fit` returns the fitted estimator, so training on the training split and
# predicting the held-out rows can be chained.
y_pred = svc.fit(xtrain, ytrain).predict(xtest)

In [40]:
# Start over with a fresh classifier, replacing the one fitted on the
# training split.
svc = LinearSVC()

# Fit on every row (no hold-out). As the last expression of the cell, the
# fitted estimator is also what the notebook displays.
svc.fit(features, labels)

LinearSVC()

In [41]:
N = 2  # number of top unigrams / bigrams kept per category

# Rows of `svc.coef_` follow the order of `svc.classes_` (the sorted label
# values), not the first-appearance ids stored in `category_to_id`, so each
# label is mapped to its own coefficient row explicitly.
class_row = {label: row for row, label in enumerate(svc.classes_)}

# `get_feature_names` no longer exists in newer scikit-learn releases; use
# its replacement when it is available. The vocabulary is loop-invariant.
if hasattr(tfidf, "get_feature_names_out"):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.asarray(tfidf.get_feature_names())

for Product, category_id in sorted(category_to_id.items()):
    # Ascending coefficient order; reversed below so the strongest terms come first.
    indices = np.argsort(svc.coef_[class_row[Product]])
    feature_names = all_feature_names[indices]
    uni = [v for v in reversed(feature_names) if len(v.split(' ')) == 1][:N]
    bi = [v for v in reversed(feature_names) if len(v.split(' ')) == 2][:N]
    

In [42]:
def query_ans(event_dom, event_type, employees):
    """Return the employees in a domain who are interested in an event type.

    Parameters
    ----------
    event_dom : str
        Value to match against the ``Domain`` column.
    event_type : str
        Value to match against either the ``Event1`` or ``Event2`` column.
    employees : pandas.DataFrame
        Table with at least ``Domain``, ``Event1`` and ``Event2`` columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``employees`` whose ``Domain`` equals ``event_dom`` and
        whose ``Event1`` or ``Event2`` equals ``event_type``.
    """
    # Reference the arguments with `@` instead of pasting them into the
    # expression: values containing a quote (e.g. "Women's Meetups") would
    # otherwise break or change the meaning of the query string.
    return employees.query(
        "Domain == @event_dom and (Event1 == @event_type or Event2 == @event_type)"
    )


In [43]:
def predict(inp_eve, employees):
    """Classify event descriptions and recommend employees for each one.

    Uses the module-level ``tfidf`` vectorizer and ``svc`` classifier. Every
    predicted label is expected to have the form ``"<domain>.<event_type>"``.
    Each input text and its predicted label are printed.

    Parameters
    ----------
    inp_eve : iterable of str
        Event descriptions to classify.
    employees : pandas.DataFrame
        Employee table with ``Name``, ``Event1`` and ``Event2`` columns, plus
        the ``Domain`` column used by ``query_ans``.

    Returns
    -------
    list of str
        One entry per input event: the matching employee names joined by
        ``", "`` (an empty string when nobody matches).
    """
    # Classifier domain tags -> the spelling passed to the employee lookup.
    # Tags that are not listed here are passed through unchanged.
    domain_names = {
        'Artificial_Intelligence': 'Artificial Intelligence',
        'WebDev': 'Web Development',
        'Mobile_Applications': 'Mobile Applications',
        'ML': 'Machine Learning',
        'CC': 'Cloud Computing',
        'Higher_Education': 'Higher Education',
        'DevOps': 'Development Processes',
        'Software_Architecture': 'Software Architecture',
        'Data_Science': 'Data Science',
        'Cpp': 'C++',
    }

    recommendations = []
    prediction = svc.predict(tfidf.transform(inp_eve))

    for event_text, predicted in zip(inp_eve, prediction):
        print('"{}"'.format(event_text))
        print("  - Predicted as: '{}'".format(predicted))
        print("")

    for predicted_label in prediction.tolist():
        domain, event_type = predicted_label.split(".")
        if domain == 'None':
            # No specific domain: match on the event type alone. `@` keeps
            # the value out of the expression text, so quotes in it are safe.
            recommend_to = employees.query(
                "Event1 == @event_type or Event2 == @event_type"
            )
        else:
            recommend_to = query_ans(
                domain_names.get(domain, domain), event_type, employees
            )
        recommendations.append(", ".join(recommend_to['Name'].values))

    return recommendations


In [44]:
def make_predictions():
    """Prompt for an events CSV, recommend employees, and save the result.

    Reads the employee table from ``CCMLEmployeeData.csv`` and the events
    from the file name typed at the prompt, stores the recommendations from
    ``predict`` in a new ``Employees`` column, and writes the events table
    to ``output.xlsx`` without the index.
    """
    events_file = input("Enter name of input file")
    staff = pd.read_csv("CCMLEmployeeData.csv")
    events = pd.read_csv(events_file, encoding='unicode_escape')
    events['Employees'] = predict(events['Events'], staff)
    events.to_excel('output.xlsx', index=False)

In [48]:
# Run the interactive pipeline: it prompts for the events CSV file name and
# writes the recommendations to output.xlsx.
make_predictions()

Enter name of input fileInputs_for_recommendation.csv
"Learn Java Basics  and get Certification for Free!!"
  - Predicted as: 'Java.Webinars'

"Job openings for C++ Developer at Amazon"
  - Predicted as: 'Cpp.Jobs'

"Machine learning is new technology attend the webinar now!"
  - Predicted as: 'Artificial_Intelligence.Webinars'

"Workshop on Financial markets this weekend!Hurry up!!"
  - Predicted as: 'Finance.Webinars'

"Two days Machine Learning Crash Course by MLI"
  - Predicted as: 'ML.Trainings'

"Talks on Job Oppurtunities in Data Science"
  - Predicted as: 'Data_Science.Talks'

"Mobile World Conference 2020. Registrations open soon!"
  - Predicted as: 'Mobile_Applications.Seminars'

"Apply for these Data Scientist positions at Google"
  - Predicted as: 'Data_Science.Jobs'

"Plan your Higher Studies with this Seminar.Hurry up Few Seats Available!!"
  - Predicted as: 'Higher_Education.Seminars'

"Workshops for Higher Education of students"
  - Predicted as: 'Higher_Education.Expos'

In [51]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split

# Load only the first two spreadsheet columns of the training workbook.
df = pd.read_excel("train_data.xlsx", usecols=[0, 1])
target_name = df['Domains+Events'].unique().tolist()
# Integer code for each label, numbered in order of first appearance.
df['Id'] = df['Domains+Events'].factorize()[0]


# One row per distinct label with its code, plus lookups in both directions.
category_id_df = df[['Domains+Events', 'Id']].drop_duplicates().sort_values('Id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['Id', 'Domains+Events']].values)


# scikit-learn's English stop-word set with "AI"/"ai" removed if present.
stopword = text.ENGLISH_STOP_WORDS.difference(["AI", "ai"])

# TF-IDF over unigrams and bigrams that occur in at least two documents.
# The token pattern keeps the literal "c++" as a single token.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=2,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    # Newer scikit-learn releases validate `stop_words` and accept only
    # 'english', a list, or None; a frozenset is rejected, so pass a list.
    stop_words=sorted(stopword),
    token_pattern=r"(?u)c\+{2}|\b\w+\b",
)
features = tfidf.fit_transform(df['Event Names'].values)
labels = df['Domains+Events']

# The vocabulary (in feature-column order) is the same for every category,
# so fetch it once. `get_feature_names` no longer exists in newer
# scikit-learn releases; use its replacement when it is available.
if hasattr(tfidf, "get_feature_names_out"):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.asarray(tfidf.get_feature_names())

for product, category_id in sorted(category_to_id.items()):
    # `labels` holds the label strings, so the one-vs-rest target must be
    # built from the label itself (`product`). Comparing against the integer
    # `category_id` is False for every row and makes the chi2 scores useless.
    features_chi2 = chi2(features, labels == product)
    # Ascending chi2 score: the most correlated terms end up last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    uni = [v for v in feature_names if len(v.split(' ')) == 1]
    bi = [v for v in feature_names if len(v.split(' ')) == 2]
    #print("# '{}':".format(product))
    #print("  . Most correlated unigrams:\n. {}".format('\n. '.join(uni[-2:])))
    #print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bi[-2:])))

from sklearn.svm import LinearSVC
svc = LinearSVC()
# Hold out 1% of the rows; the row index is split too so each side can be
# mapped back to rows of `df`.
xtrain, xtest, ytrain, ytest, indices_train, indices_test = train_test_split(features, labels, df.index, test_size=0.01, random_state=0)


svc.fit(xtrain, ytrain)
y_pred = svc.predict(xtest)

# Final model: a fresh classifier fitted on every row (no hold-out).
svc = LinearSVC()
svc.fit(features, labels)

# Rows of `svc.coef_` follow the order of `svc.classes_` (the sorted label
# values), not the first-appearance ids stored in `category_to_id`, so each
# label is mapped to its own coefficient row explicitly.
class_row = {label: row for row, label in enumerate(svc.classes_)}

# `get_feature_names` no longer exists in newer scikit-learn releases; use
# its replacement when it is available. The vocabulary is loop-invariant.
if hasattr(tfidf, "get_feature_names_out"):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.asarray(tfidf.get_feature_names())

for Product, category_id in sorted(category_to_id.items()):
    # Ascending coefficient order; reversed below so the strongest terms come first.
    indices = np.argsort(svc.coef_[class_row[Product]])
    feature_names = all_feature_names[indices]
    uni = [v for v in reversed(feature_names) if len(v.split(' ')) == 1][:2]
    bi = [v for v in reversed(feature_names) if len(v.split(' ')) == 2][:2]
    

def query_ans(event_dom, event_type, employees):
    """Return the employees in a domain who are interested in an event type.

    Parameters
    ----------
    event_dom : str
        Value to match against the ``Domain`` column.
    event_type : str
        Value to match against either the ``Event1`` or ``Event2`` column.
    employees : pandas.DataFrame
        Table with at least ``Domain``, ``Event1`` and ``Event2`` columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``employees`` whose ``Domain`` equals ``event_dom`` and
        whose ``Event1`` or ``Event2`` equals ``event_type``.
    """
    # Reference the arguments with `@` instead of pasting them into the
    # expression: values containing a quote (e.g. "Women's Meetups") would
    # otherwise break or change the meaning of the query string.
    return employees.query(
        "Domain == @event_dom and (Event1 == @event_type or Event2 == @event_type)"
    )


def predict(inp_eve, employees):
    """Classify event descriptions and recommend employees for each one.

    Uses the module-level ``tfidf`` vectorizer and ``svc`` classifier. Every
    predicted label is expected to have the form ``"<domain>.<event_type>"``.
    Each input text and its predicted label are printed.

    Parameters
    ----------
    inp_eve : iterable of str
        Event descriptions to classify.
    employees : pandas.DataFrame
        Employee table with ``Name``, ``Event1`` and ``Event2`` columns, plus
        the ``Domain`` column used by ``query_ans``.

    Returns
    -------
    list of str
        One entry per input event: the matching employee names joined by
        ``", "`` (an empty string when nobody matches).
    """
    # Classifier domain tags -> the spelling passed to the employee lookup.
    # Tags that are not listed here are passed through unchanged.
    domain_names = {
        'Artificial_Intelligence': 'Artificial Intelligence',
        'Data_Science': 'Data Science',
        'CC': 'Cloud Computing',
        'WebDev': 'Web Development',
        'Mobile_Applications': 'Mobile Applications',
        'Software_Architecture': 'Software Architecture',
        'ML': 'Machine Learning',
        'Higher_Education': 'Higher Education',
        'DevOps': 'Development Processes',
        'Cpp': 'C++',
    }

    recommendations = []
    prediction = svc.predict(tfidf.transform(inp_eve))

    for event_text, predicted in zip(inp_eve, prediction):
        print('"{}"'.format(event_text))
        print("  - Predicted as: '{}'".format(predicted))
        print("")

    for predicted_label in prediction.tolist():
        domain, event_type = predicted_label.split(".")
        if domain == 'None':
            # No specific domain: match on the event type alone. `@` keeps
            # the value out of the expression text, so quotes in it are safe.
            recommend_to = employees.query(
                "Event1 == @event_type or Event2 == @event_type"
            )
        else:
            recommend_to = query_ans(
                domain_names.get(domain, domain), event_type, employees
            )
        recommendations.append(", ".join(recommend_to['Name'].values))

    return recommendations

def create_excel():
    """Prompt for an events CSV, recommend employees, and save the result.

    Reads the employee table from ``CCMLEmployeeData.csv`` and the events
    from the file name typed at the prompt, stores the recommendations from
    ``predict`` in a new ``Employees`` column, and writes the events table
    to ``result.xlsx`` without the index.
    """
    events_file = input("Enter name of input file")
    staff = pd.read_csv("CCMLEmployeeData.csv")
    events = pd.read_csv(events_file, encoding='unicode_escape')
    events['Employees'] = predict(events['Events'], staff)
    events.to_excel('result.xlsx', index=False)


# Run the interactive pipeline right away.
create_excel()

Enter name of input fileInputs_for_recommendation.csv
"Learn Java Basics  and get Certification for Free!!"
  - Predicted as: 'Java.Webinars'

"Job openings for C++ Developer at Amazon"
  - Predicted as: 'Cpp.Jobs'

"Machine learning is new technology attend the webinar now!"
  - Predicted as: 'Artificial_Intelligence.Webinars'

"Workshop on Financial markets this weekend!Hurry up!!"
  - Predicted as: 'Finance.Webinars'

"Two days Machine Learning Crash Course by MLI"
  - Predicted as: 'ML.Trainings'

"Talks on Job Oppurtunities in Data Science"
  - Predicted as: 'Data_Science.Talks'

"Mobile World Conference 2020. Registrations open soon!"
  - Predicted as: 'Mobile_Applications.Seminars'

"Apply for these Data Scientist positions at Google"
  - Predicted as: 'Data_Science.Jobs'

"Plan your Higher Studies with this Seminar.Hurry up Few Seats Available!!"
  - Predicted as: 'Higher_Education.Seminars'

"Workshops for Higher Education of students"
  - Predicted as: 'Higher_Education.Expos'