## 1. Importing Libs

In [10]:
import logging
import pandas as pd
import boto3
from botocore.exceptions import ClientError
from boto3.dynamodb.conditions import Key
from decimal import Decimal
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout, LSTM
from tensorflow.keras.optimizers import Adam
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

# Constants
TABLE_NAME = 'twitter-analytics-v2'
RATING_FILE_PATH = 'database/ratingv2.csv'
COLUMN_TYPES = {
    'item_id': 'str',
    'user_who_published': 'str',
    'user_id': 'Int32',
    'ranking': 'Int32',
    'rating': 'Int32',
    'algorithm': 'str',
    'date': 'str'
}

# Setup logging
logging.basicConfig(level=logging.INFO)

# Initialize resources
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table(TABLE_NAME)

In [2]:
def get_tweets_from_user(user_id):
    """Get tweets published by a user."""
    try:
        response = table.query(
            KeyConditionExpression=Key('PK').eq(f'Tweet#AuthorId#{user_id}'))
    except ClientError as e:
        logging.error(f"Failed to get tweets from user {user_id}: {e}")
    else:
        return response['Items']

In [3]:
def get_tweet(user_id, tweet_id):
    """Get a specific tweet published by a user."""
    try:
        response = table.get_item(Key={'PK': f'Tweet#AuthorId#{user_id}', 'SK': f'TweetId#{tweet_id}'})
    except ClientError as e:
        logging.error(f"Failed to get tweet {tweet_id} from user {user_id}: {e}")
    else:
        return response.get('Item', {})

In [4]:
def read_recommendations(file_path):
    """Read recommendations from a CSV file."""
    return pd.read_csv(file_path, dtype=COLUMN_TYPES)

## 2. Load unique tweets from the dataset of recommendations

In [16]:
def get_tweets_data_from_proposal_method(recommendations):
    """Enhance recommendations with additional tweet data."""
    for col in ['score', 'text', 'like_count', 'retweet_count', 'reply_count', 'quote_count']:
        if col not in recommendations.columns:
            recommendations[col] = None 

    unique_items = recommendations[['item_id', 'user_who_published']].drop_duplicates()

    for index, row in unique_items.iterrows():
        tweet = get_tweet(str(row['user_who_published']), str(row['item_id']))

        if tweet:
            unique_items.at[index, 'user_id'] = recommendations.iloc[index]['user_id']
            unique_items.at[index, 'score'] = Decimal(tweet.get('SocialCapitalScore', 0))
            unique_items.at[index, 'text'] = tweet.get('Text', '')
            unique_items.at[index, 'like_count'] = Decimal(tweet.get('LikeCount', 0))
            unique_items.at[index, 'retweet_count'] = Decimal(tweet.get('RetweetCount', 0))
            unique_items.at[index, 'reply_count'] = Decimal(tweet.get('ReplyCount', 0))
            unique_items.at[index, 'quote_count'] = Decimal(tweet.get('QuoteCount', 0))
    return unique_items

In [19]:
def create_new_recommendations(recommendations, unique_items):
    newRecommendations = pd.DataFrame()
    for index, row in recommendations.iterrows():
        user_id = row['user_id']
        who_published = str(row['user_who_published'])
        tweet_id = str(row['item_id'])
        algorithm = row['algorithm']

        rating = recommendations[(recommendations['item_id'] == tweet_id) & (recommendations['algorithm'] == algorithm) & (recommendations['user_id'] == user_id)]['rating'].values[0]

        item = unique_items[unique_items['item_id'] == tweet_id]
        score = item['score'].values[0]
        text = item['text'].values[0]
        newRecommendations.at[index, 'text'] = text
        newRecommendations.at[index, 'item_id'] = tweet_id
        newRecommendations.at[index, 'user_who_published'] = who_published
        newRecommendations.at[index, 'user_id'] = user_id
        newRecommendations.at[index, 'ranking'] = 0
        newRecommendations.at[index, 'rating'] = int(rating)
        newRecommendations.at[index, 'score'] = score
        newRecommendations.at[index, 'like_count'] = item['like_count'].values[0]
        newRecommendations.at[index, 'retweet_count'] = item['retweet_count'].values[0]
        newRecommendations.at[index, 'reply_count'] = item['reply_count'].values[0]
        newRecommendations.at[index, 'quote_count'] = item['quote_count'].values[0]
        newRecommendations.at[index, 'algorithm'] = f'{algorithm}-SCSA_PLUS'
        newRecommendations.at[index, 'date'] = "2023-08-19"

    ranking = 1
    sortRecommendationsByUserAndAlgorithm = newRecommendations.sort_values(by=['user_id','algorithm', 'score'], ascending=False)

    for index, row in sortRecommendationsByUserAndAlgorithm.iterrows():
        sortRecommendationsByUserAndAlgorithm.at[index, 'ranking'] = ranking
        if(ranking == 10):
            ranking = 1
        else: 
            ranking = ranking + 1

    print(sortRecommendationsByUserAndAlgorithm.head())
    sortRecommendationsByUserAndAlgorithm.drop('score', axis=1, inplace=True)

    #return pd.concat([recommendations, sortRecommendationsByUserAndAlgorithm], ignore_index=True)

    return sortRecommendationsByUserAndAlgorithm

## 2. Load Your Data

In [None]:
base_recommendations_df = read_recommendations(RATING_FILE_PATH)
logging.info(base_recommendations_df.head())

proposed_recommendations = get_tweets_data_from_proposal_method(base_recommendations_df)

In [None]:
recommendations = create_new_recommendations(base_recommendations_df, proposed_recommendations)
recommendations.to_csv('output_recommendations.csv', index=False)

## 3. Text Processing with TF-IDF

In [21]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=1000)  # You can adjust the number of features

# Fit and transform the 'tweet_text' column
tfidf_matrix = tfidf.fit_transform(recommendations['text'].fillna(''))

# Convert the TF-IDF matrix to a DataFrame to make it easier to work with
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

# Optionally, prefix the column names to distinguish text features
tfidf_df.columns = ['text_' + str(col) for col in tfidf_df.columns]

## 4. Normalize Interaction Data

In [22]:
# Select interaction columns
interaction_data = recommendations[['like_count', 'retweet_count', 'quote_count', 'reply_count']].fillna(0)

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Scale the interaction data
scaled_interaction_data = scaler.fit_transform(interaction_data)

# Convert the scaled data into a DataFrame
interaction_df = pd.DataFrame(scaled_interaction_data, columns=interaction_data.columns)

# Optionally, prefix the column names to distinguish interaction features
interaction_df.columns = ['interaction_' + str(col) for col in interaction_df.columns]

## 5. Combine Features

In [23]:
# Combine the TF-IDF features and the scaled interaction features
combined_features = pd.concat([tfidf_df, interaction_df], axis=1)

# If you have additional features like user or tweet IDs, you can include them as well
combined_features['user_id'] = recommendations['user_id']
combined_features['tweet_id'] = recommendations['item_id']

## 6. Encode Categorical Data

In [24]:
from sklearn.preprocessing import LabelEncoder

# Initialize label encoders
user_id_encoder = LabelEncoder()
tweet_id_encoder = LabelEncoder()
algorithm_encoder = LabelEncoder()

# Fit and transform the categorical columns
combined_features['user_id_encoded'] = user_id_encoder.fit_transform(combined_features['user_id'])
combined_features['tweet_id_encoded'] = tweet_id_encoder.fit_transform(combined_features['tweet_id'])
combined_features['algorithm_encoded'] = algorithm_encoder.fit_transform(recommendations['algorithm'])

## 7. Prepare for Modeling

In [25]:
# Define your target variable (assuming 'rating' is the column with your target variable)
target = recommendations['rating'].astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, target, test_size=0.2, random_state=42)

# Convert pandas DataFrame to float32
X_train = X_train.astype('float32')
y_train = y_train.astype('float32')

# If using NumPy, ensure the arrays are the correct type
y_train = np.array(y_train).astype('float32')

# Replace inf with a large finite number and nan with zero
X_train = np.nan_to_num(X_train)
y_train = np.nan_to_num(y_train)

# Example of reshaping if your model expects one feature
# y_train might need to be reshaped if it's a single feature
y_train = y_train.reshape(-1, 1)  # Reshape if necessary


## 8. Define the Neural Network Architecture

In [26]:
# Text and interaction features input
text_interaction_input = Input(shape=(X_train.shape[1],), name="text_interaction_input")

# User and tweet embeddings
user_input = Input(shape=(1,), name="user_input")
user_embedding = Embedding(input_dim=len(user_id_encoder.classes_), output_dim=50)(user_input)
user_embedding = Flatten()(user_embedding)

tweet_input = Input(shape=(1,), name="tweet_input")
tweet_embedding = Embedding(input_dim=len(tweet_id_encoder.classes_), output_dim=50)(tweet_input)
tweet_embedding = Flatten()(tweet_embedding)

# Algorithm input
algorithm_input = Input(shape=(1,), name="algorithm_input")
algorithm_embedding = Embedding(input_dim=len(algorithm_encoder.classes_), output_dim=10)(algorithm_input)
algorithm_embedding = Flatten()(algorithm_embedding)

# Combine all inputs
concat_layer = Concatenate()([text_interaction_input, user_embedding, tweet_embedding, algorithm_embedding])

# Deep Neural Network layers
dnn_layer = Dense(256, activation='relu')(concat_layer)
dnn_layer = Dropout(0.2)(dnn_layer)
tf = Dense(128, activation='relu')(dnn_layer)
output = Dense(1)(dnn_layer)

# Define model
model = Model(inputs=[text_interaction_input, user_input, tweet_input, algorithm_input], outputs=output)

# Compile model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')



## 9. Train the Model

In [27]:
# Prepare inputs
train_inputs = {
    "text_interaction_input": X_train.drop(['user_id_encoded', 'tweet_id_encoded', 'algorithm_encoded'], axis=1),
    "user_input": X_train['user_id_encoded'],
    "tweet_input": X_train['tweet_id_encoded'],
    "algorithm_input": X_train['algorithm_encoded']
}

test_inputs = {
    "text_interaction_input": X_test.drop(['user_id_encoded', 'tweet_id_encoded', 'algorithm_encoded'], axis=1),
    "user_input": X_test['user_id_encoded'],
    "tweet_input": X_test['tweet_id_encoded'],
    "algorithm_input": X_test['algorithm_encoded']
}

# Train the model
model.fit(train_inputs, y_train, validation_split=0.1, epochs=10, batch_size=32)

AttributeError: 'numpy.ndarray' object has no attribute 'drop'

## 10. Predict and Evaluate

In [None]:
# Predict ratings
predicted_ratings = model.predict(test_inputs)

# Evaluate the model (you can use more sophisticated metrics as needed)
mse = tf.keras.losses.MeanSquaredError()
error = mse(y_test, predicted_ratings).numpy()
print(f"Mean Squared Error on Test Set: {error}")