In [1]:
import pandas as pd

In [2]:
test_data_1 = pd.read_excel('/kaggle/input/test-data-adobe/behaviour_simulation_test_company.xlsx',index_col='id') 
test_data_2 = pd.read_excel('/kaggle/input/test-data-adobe/behaviour_simulation_test_time.xlsx',index_col='id') 

In [3]:
import pandas as pd
import numpy as np
import requests
import os
import gc
from tqdm import tqdm
from urllib.parse import urlparse
import torch
import torchvision.transforms as transforms
from PIL import Image
from torchvision.models import efficientnet_b0, EfficientNet_B0_Weights
from multiprocessing import Pool
from transformers import AutoTokenizer, AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_embedding_model():
    model = efficientnet_b0(weights=EfficientNet_B0_Weights.DEFAULT)
    # features = list(model.children())[:-1]
    # embedding_size = 128
    # features.extend([
    #     torch.nn.AdaptiveAvgPool2d(1),
    #     torch.nn.Flatten(),
    #     torch.nn.Linear(1280, embedding_size)
    # ])
    # embedding_model = torch.nn.Sequential(*features).to(device)
    # embedding_model.eval()
    model.to(device)
    model.eval()
    return model

transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def extract_url(media_string):
    if pd.isna(media_string):
        return None
    if 'Photo' in media_string:
        return media_string.split("previewUrl='")[1].split("'")[0]
    else:
        return media_string.split("thumbnailUrl='")[1].split("'")[0]
    return None

def download_media(url, save_dir):
    try:
        response = requests.get(url, stream=True, timeout=10)
        if response.status_code == 200:
            file_name = os.path.basename(urlparse(url).path)
            file_path = os.path.join(save_dir, file_name)
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
            return file_path
        else:
            return None
    except requests.RequestException:
        return None

def download_media_parallel(urls, save_dir):
    with Pool() as pool:
        return pool.starmap(download_media, [(url, save_dir) for url in urls if url])

def get_batch_embeddings(img_paths, model, batch_size=32):
    embeddings = []
    for i in range(0, len(img_paths), batch_size):
        batch_paths = img_paths[i:i+batch_size]
        batch_imgs = [Image.open(path, mode='r', formats=['JPEG', 'PNG']).convert('RGB') for path in batch_paths if isinstance(path, str) and os.path.exists(path)]
        if batch_imgs:
            batch_tensors = torch.stack([transform(img) for img in batch_imgs]).to(device)
            with torch.no_grad():
                batch_embeddings = model(batch_tensors).cpu().numpy()
            embeddings.extend(batch_embeddings)
    return embeddings

def get_bertweet_embeddings(texts, model, tokenizer, max_length=128, batch_size=32):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating BERTweet embeddings"):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", max_length=max_length, truncation=True, padding="max_length")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # CLS token
        embeddings.extend(batch_embeddings)
        del inputs, outputs, batch_embeddings
        torch.cuda.empty_cache()
    return np.array(embeddings)

def preprocessing(df, save_dir='/kaggle/working/adobe/'):
    # Date feature extraction
    df['date'] = pd.to_datetime(df['date'])
    df['month'] = df['date'].dt.month
    df['is_weekend'] = (df['date'].dt.dayofweek>=5)*1
    df['is_month_start'] = df['date'].dt.is_month_start*1
    df['is_month_end'] = df['date'].dt.is_month_end*1
    df['day'] = df['date'].dt.day
    df = df.drop('date', axis=1)
    
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    # Image processing
    df['media_url'] = df['media'].apply(extract_url)
    df['local_path'] = download_media_parallel(df['media_url'].dropna().tolist(), save_dir)
    df['local_path'] = df['local_path'].apply(lambda x: x if x and os.path.exists(x) else None)
    valid_paths = df['local_path'].dropna().tolist()
    
    embedding_model = get_embedding_model()
    image_embeddings = get_batch_embeddings(valid_paths, embedding_model, batch_size=16)
    image_embedding_df = pd.DataFrame(image_embeddings, columns=[f'image_embedding_{i}' for i in range(1000)])
    df = pd.concat([df.reset_index(drop=True), image_embedding_df], axis=1)
    
    # Text processing
    df['combined_text'] = 'username : ' + df['username'] + ' '+ 'comapny : ' + df['inferred company']+'content : '+df['content'] 
    
    tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=True)
    model = AutoModel.from_pretrained("vinai/bertweet-base").to(device)
    model.eval()
    
    bertweet_embeddings = get_bertweet_embeddings(df['combined_text'].tolist(), model, tokenizer, batch_size=32)
    bertweet_embedding_df = pd.DataFrame(bertweet_embeddings, columns=[f'text_embedding_{i}' for i in range(768)])
    df = pd.concat([df, bertweet_embedding_df], axis=1)
    
    # Drop unnecessary columns
    df = df.drop(['media', 'media_url', 'local_path', 'content', 'username', 'inferred company', 'combined_text'], axis=1)
    
    gc.collect()
    torch.cuda.empty_cache()
    
    return df

In [4]:
import warnings
warnings.filterwarnings('ignore')
test_data_1_preprocessed = preprocessing(test_data_1)
test_data_2_preprocessed = preprocessing(test_data_2)

Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth
100%|██████████| 20.5M/20.5M [00:00<00:00, 194MB/s]


config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.91M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Generating BERTweet embeddings: 100%|██████████| 313/313 [00:41<00:00,  7.45it/s]
Generating BERTweet embeddings: 100%|██████████| 313/313 [00:42<00:00,  7.41it/s]


In [5]:
test_data_1_preprocessed

Unnamed: 0,month,is_weekend,is_month_start,is_month_end,day,image_embedding_0,image_embedding_1,image_embedding_2,image_embedding_3,image_embedding_4,...,text_embedding_758,text_embedding_759,text_embedding_760,text_embedding_761,text_embedding_762,text_embedding_763,text_embedding_764,text_embedding_765,text_embedding_766,text_embedding_767
0,1,0,0,0,8,-1.243495,0.578566,-1.127594,-1.031803,-0.365271,...,0.082592,0.023944,-0.006622,0.113206,0.072232,-0.116173,0.086840,-0.024582,-0.167198,-0.138928
1,1,0,0,0,29,-0.149611,-1.672527,-0.855305,-1.408817,-0.125103,...,0.054884,-0.048752,0.074601,0.124889,0.005540,-0.044088,0.013322,-0.020754,-0.437311,-0.257576
2,9,0,0,0,5,0.736935,0.611419,-0.089314,-1.636753,-0.478826,...,0.052638,0.024041,0.034373,0.077102,0.024527,-0.137520,-0.048156,-0.065898,-0.023241,-0.129378
3,3,0,0,0,28,2.735143,1.654275,0.514157,1.020851,1.387458,...,0.168835,0.098010,-0.010389,-0.018441,0.139027,-0.165451,0.000101,-0.171490,-0.239613,-0.184539
4,9,0,0,0,11,-1.896783,-0.736010,0.177177,-0.537971,0.195469,...,0.051267,0.148504,-0.006104,0.033023,0.014897,-0.048310,0.202236,-0.094700,-0.101467,-0.071769
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,1,0,0,25,,,,,,...,-0.013618,-0.084858,0.099641,-0.150586,0.244001,0.028948,0.237411,-0.112068,-0.070366,0.058616
9996,6,0,0,0,26,,,,,,...,-0.097790,0.177129,0.241268,0.192368,0.094645,-0.152168,-0.159041,-0.150842,-0.075497,-0.124584
9997,5,0,0,0,6,,,,,,...,0.070087,0.025117,0.014250,0.085853,0.130755,-0.199604,0.135042,0.070040,-0.076886,-0.221017
9998,5,1,0,0,19,,,,,,...,0.151249,0.123878,0.033495,-0.136623,0.087724,-0.115950,0.064387,-0.096470,-0.106854,-0.096425


In [6]:
import warnings
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split  # Import train_test_split
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

df = test_data_1_preprocessed
X = df[list(test_data_1_preprocessed.columns)[:]]
X = X.fillna(0)

# Convert datetime columns to numeric
for col in X.select_dtypes(include=['datetime64']).columns:
    X[col] = X[col].astype(int) // 10**9  # Convert to Unix timestamp


# Function to create neural network
def create_nn_model(input_dim):
    model = Sequential([
        Dense(64, activation='relu', input_dim=input_dim),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model

# Train-test split instead of GroupKFold

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
import joblib
from tensorflow.keras.models import load_model

# Load LightGBM model
lgb_model = joblib.load('/kaggle/input/pretrained-model/lightgbm_model (1).pkl')

# Load Neural Network model
from keras.models import load_model
from keras.losses import mean_squared_error

# Pass the mse function explicitly
nn_model = load_model('/kaggle/input/pretrained-model/neural_network_model (1).h5', 
                      custom_objects={'mse': mean_squared_error})
# nn_model = load_model('/kaggle/input/pretrained-model/neural_network_model (1).h5')

# Now you can use both models for inference
y_pred_lgb = lgb_model.predict(X_scaled)
y_pred_lgb =np.abs(y_pred_lgb)
y_pred_nn = nn_model.predict(X_scaled).flatten()

I0000 00:00:1728747856.585161      86 service.cc:145] XLA service 0x7b11b0007380 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1728747856.585210      86 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0


[1m125/313[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m0s[0m 1ms/step

I0000 00:00:1728747857.404478      86 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step


In [7]:
y_pred_1 = np.mean([y_pred_lgb,y_pred_nn],axis=0)
sub1 = pd.DataFrame({})
sub1['likes_predicted'] = y_pred_1

In [8]:
import warnings
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split  # Import train_test_split
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

df = test_data_2_preprocessed
X = df[list(test_data_2_preprocessed.columns)[:]]
X = X.fillna(0)

# Convert datetime columns to numeric
for col in X.select_dtypes(include=['datetime64']).columns:
    X[col] = X[col].astype(int) // 10**9  # Convert to Unix timestamp


# Function to create neural network
def create_nn_model(input_dim):
    model = Sequential([
        Dense(64, activation='relu', input_dim=input_dim),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model

# Train-test split instead of GroupKFold

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
import joblib
from tensorflow.keras.models import load_model

# Load LightGBM model
lgb_model = joblib.load('/kaggle/input/pretrained-model/lightgbm_model (1).pkl')

# Load Neural Network model
from keras.models import load_model
from keras.losses import mean_squared_error

# Pass the mse function explicitly
nn_model = load_model('/kaggle/input/pretrained-model/neural_network_model (1).h5', 
                      custom_objects={'mse': mean_squared_error})
# nn_model = load_model('/kaggle/input/pretrained-model/neural_network_model (1).h5')

# Now you can use both models for inference
y_pred_lgb = lgb_model.predict(X_scaled)
y_pred_lgb =np.abs(y_pred_lgb)
y_pred_nn = nn_model.predict(X_scaled).flatten()

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [9]:
y_pred_2 = np.mean([y_pred_lgb,y_pred_nn],axis=0)
sub2 = pd.DataFrame({})
sub2['likes_predicted'] = y_pred_2

In [10]:
out1 = sub1.to_csv('test_company.csv')
out2 = sub2.to_csv('test_time.csv')