In [3]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [19]:
import pandas as pd
import numpy as np
import re
from fuzzywuzzy import fuzz, process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")


In [21]:

# Load the datasets
def load_data(g_path, s_train_path, submission_path):
    """Read the three pipe-delimited, UTF-8 encoded CSV files.

    Args:
        g_path: Path of the reference company table.
        s_train_path: Path of the labelled training names.
        submission_path: Path of the sample submission file.

    Returns:
        A 3-tuple of DataFrames in the same order as the arguments.
    """
    csv_options = {'delimiter': '|', 'encoding': 'utf-8'}
    return tuple(
        pd.read_csv(csv_path, **csv_options)
        for csv_path in (g_path, s_train_path, submission_path)
    )



In [23]:
# Read the reference companies, the labelled training names and the sample submission.
G, STrain, sample_submission = load_data(
    g_path="G.csv",
    s_train_path="STrain.csv",
    submission_path="sample_submission.csv",
)

# Show the first rows of each frame as a quick sanity check of the parse.
for _label, _frame in (
    ("G:\n", G),
    ("STrain:\n", STrain),
    ("Sample Submission:\n", sample_submission),
):
    print(_label, _frame.head())

G:
    company_id                        name
0      634022                  PRIMCOM SA
1      324497       The David Isaacs Fund
2      280848  Bramor Enterprises Limited
3      432662                NAVEXIM S.A.
4      524224              Magal Group SA
STrain:
    train_index                                               name  company_id
0            0                        ATRION Immo bilien & Co. KG          -1
1            1                            MyTyme Inve stments Inc      356624
2            2                                     Financial USI.      510805
3            3  FlexShares Trust - FlexShares Morningstar Emer...      523467
4            4                                    Health Sinai SF      231108
Sample Submission:
    test_index  company_id
0           0      175199
1           1      356621
2           2      191063
3           3          -1
4           4          -1


In [25]:
# Preprocessing function
def clean_text(text):
    """Normalise a company name for similarity comparison.

    Lower-cases the text, deletes every character other than ``a-z``, ``0-9``
    and the space, collapses runs of spaces into one and trims both ends.

    Args:
        text: The raw name. Missing values (``None`` / NaN) are accepted and
            any other non-string value is converted with ``str()``.

    Returns:
        The cleaned string; ``''`` for a missing value.
    """
    if not isinstance(text, str):
        # Missing names come out of read_csv as float NaN; calling .lower() on
        # them would raise AttributeError, so map them to the empty string.
        if text is None or pd.isna(text):
            return ''
        text = str(text)
    text = re.sub(r'[^a-z0-9 ]', '', text.lower())  # Remove special characters
    # Deleting a standalone token such as "&" leaves two adjacent spaces behind;
    # squeeze them so they do not count as extra edits in the similarity scores.
    text = re.sub(r' +', ' ', text)
    return text.strip()


In [27]:

# Feature engineering: String similarity functions
def compute_string_similarities(s_name, g_name):
    """Compute pairwise string-similarity features for two (cleaned) names.

    Args:
        s_name: Name from the query set.
        g_name: Candidate name from the reference set.

    Returns:
        dict with, in this order:
            'levenshtein': the ``fuzz.ratio`` score of the two strings,
            'jaccard': Jaccard index of the two *character* sets (spaces
                included), 0.0 when both strings are empty,
            'tfidf_cosine': 0, a placeholder the caller overwrites.
    """
    s_chars, g_chars = set(s_name), set(g_name)
    all_chars = s_chars | g_chars
    # Two empty names (e.g. names made only of stripped characters) have an empty
    # union; score them 0.0 instead of raising ZeroDivisionError.
    jaccard = len(s_chars & g_chars) / len(all_chars) if all_chars else 0.0
    return {
        'levenshtein': fuzz.ratio(s_name, g_name),
        'jaccard': jaccard,
        'tfidf_cosine': 0  # Placeholder, computed later
    }



In [29]:
# Main function to create features
def generate_features(S, G, index_col='train_index'):
    """Build one feature row per name in ``S`` from its best TF-IDF match in ``G``.

    A TF-IDF vocabulary is fitted on the cleaned names of ``G``. For every name
    in ``S`` the cosine similarity against all of ``G`` is computed in a single
    call, the first row with the highest score is taken as the candidate, and
    the string similarities are computed for that one pair only.

    Args:
        S: DataFrame with a 'name' column and an id column named ``index_col``.
        G: DataFrame with 'name' and 'company_id' columns.
        index_col: Name of the id column in ``S``; it is copied to the output
            under the same name. Defaults to 'train_index'.

    Returns:
        DataFrame with columns 'levenshtein', 'jaccard', 'tfidf_cosine',
        'company_id' and ``index_col``, one row per row of ``S`` in the same
        order. Note that 'company_id' is the id of the matched candidate from
        ``G``, not a ground-truth label.

    Raises:
        ValueError: If ``G`` has no rows.
    """
    output_columns = ['levenshtein', 'jaccard', 'tfidf_cosine', 'company_id', index_col]
    if len(G) == 0:
        raise ValueError("G must contain at least one company to match against")

    # Clean G once up front instead of once per (S row, G row) pair.
    g_clean = G['name'].apply(clean_text).tolist()
    g_ids = G['company_id'].tolist()
    vectorizer = TfidfVectorizer()
    tfidf_g_matrix = vectorizer.fit_transform(g_clean)

    s_clean = S['name'].apply(clean_text).tolist()
    s_ids = S[index_col].tolist()
    if not s_clean:
        return pd.DataFrame(columns=output_columns)
    tfidf_s_matrix = vectorizer.transform(s_clean)

    features = []
    for pos, s_name in enumerate(s_clean):
        # One sparse product against the whole of G replaces the inner loop that
        # called cosine_similarity (and a boolean-mask row lookup) per G row.
        cosines = cosine_similarity(tfidf_s_matrix[pos], tfidf_g_matrix).ravel()
        # argmax returns the first maximum, the same tie-break as max() over G order.
        best = int(cosines.argmax())

        best_match = compute_string_similarities(s_name, g_clean[best])
        best_match['tfidf_cosine'] = cosines[best]
        best_match['company_id'] = g_ids[best]
        best_match[index_col] = s_ids[pos]
        features.append(best_match)

    return pd.DataFrame(features, columns=output_columns)


In [None]:
# One feature row per training name, taken from its best TF-IDF candidate in G.
train_features = generate_features(S=STrain, G=G)
print("Generated Features:\n", train_features.head())

In [None]:

# Training model
def train_model(features, labels, test_size=0.2, random_state=42, n_estimators=100):
    """Fit a random forest on the similarity features and report hold-out accuracy.

    Args:
        features: DataFrame containing the feature columns plus 'company_id' and
            'train_index', which are dropped before fitting.
        labels: Target company ids, aligned with ``features`` by position.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed used for both the split and the forest.
        n_estimators: Number of trees in the forest.

    Returns:
        (model, le): the fitted RandomForestClassifier and the LabelEncoder that
        maps its integer classes back to the original label values.

    NOTE(review): the classifier predicts the encoded company id directly from
    the similarity scores of a single candidate; confirm that this is the
    intended formulation rather than a match / no-match decision.
    """
    X = features.drop(columns=['company_id', 'train_index'])
    le = LabelEncoder()
    y = le.fit_transform(labels)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)

    # The hold-out rows were previously split off and then ignored; score them so
    # the 20% that is withheld from training actually yields a quality signal.
    print(f"Hold-out accuracy: {model.score(X_test, y_test):.4f}")

    return model, le


In [None]:
# Fit the classifier on the training features against the known company ids.
model, label_encoder = train_model(
    features=train_features,
    labels=STrain['company_id'],
)

In [None]:

# Inference function
def predict(model, le, test_features, index_col='train_index'):
    """Predict a company id for every row of ``test_features``.

    Args:
        model: Fitted estimator exposing ``predict``.
        le: Fitted encoder exposing ``inverse_transform``.
        test_features: Feature DataFrame containing ``index_col`` and,
            optionally, a 'company_id' candidate column.
        index_col: Name of the row-id column. Defaults to 'train_index'.

    Returns:
        A new DataFrame with columns [``index_col``, 'company_id'] that keeps
        the index of ``test_features``. The input frame is not modified.

    Raises:
        KeyError: If ``index_col`` is not a column of ``test_features``.
    """
    # Start from the id column; this fails fast if it is missing and avoids
    # writing the predictions into the caller's frame.
    result = test_features[[index_col]].copy()

    # train_model drops both the id column and the 'company_id' candidate column,
    # so exclude both here too - otherwise the model is given a feature set that
    # differs from the one it was fitted on.
    feature_cols = [
        col for col in test_features.columns
        if col not in (index_col, 'company_id')
    ]
    y_pred = model.predict(test_features[feature_cols])

    result['company_id'] = le.inverse_transform(y_pred)
    return result


In [None]:
# STest is not defined by any earlier cell, so this cell raised NameError on a fresh kernel.
# NOTE(review): the file name "STest.csv" is assumed from the G.csv / STrain.csv naming -
# confirm the real path of the test set.
STest = pd.read_csv("STest.csv", delimiter='|', encoding='utf-8')
# generate_features reads the row id from 'train_index' by default, while the sample
# submission is keyed by 'test_index'; align the name if the test file uses the latter
# (rename is a no-op when the column is absent).
STest = STest.rename(columns={'test_index': 'train_index'})
test_features = generate_features(STest, G)

In [None]:
predictions = predict(model, label_encoder, test_features)
# The sample submission is keyed by 'test_index', whereas predict() labels the id column
# 'train_index'; rename it so the saved file has the expected header.
predictions = predictions.rename(columns={'train_index': 'test_index'})
# Preview only the first rows, like the other cells, instead of printing the whole frame.
print("Predictions:\n", predictions.head())

In [None]:

# Save output
def save_predictions(predictions, output_path):
    """Write ``predictions`` to ``output_path`` as a pipe-delimited UTF-8 CSV.

    The DataFrame index is not written to the file.
    """
    csv_options = {'sep': '|', 'index': False, 'encoding': 'utf-8'}
    predictions.to_csv(output_path, **csv_options)


In [None]:
# Persist the predictions in the pipe-delimited format and confirm where they went.
save_predictions(predictions, output_path="final_predictions.csv")
print("Predictions saved to final_predictions.csv")