Permalink
Cannot retrieve contributors at this time
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
1510 lines (1163 sloc)
35.2 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /*********************************************************************************** | |
| * ________________ * | |
| * || Misc. funs || * | |
| * ---------------- * | |
| * * | |
| * Author: Laurent R. Berge * | |
| * * | |
| * Three groups of non-parallel functions: * | |
| * 1) simple functions that are faster than base R (because more specific) * | |
| * 2) function to obtain the FE coefficients after the estimation is done * | |
| * 3) functions to lag variables * | |
| * * | |
| * Nothing much to say, everything is pretty explicit. * | |
| * * | |
| **********************************************************************************/ | |
| #include <Rcpp.h> | |
| #include <math.h> | |
| #include <vector> | |
| #include <stdio.h> | |
| #include <cmath> | |
| #include <functional> | |
| using namespace Rcpp; | |
| using namespace std; | |
| // [[Rcpp::plugins(cpp11)]] | |
| // [[Rcpp::export]] | |
| NumericVector cpp_lgamma(NumericVector x){ | |
| // simple function to compute lgamma of a vector | |
| int n = x.length(); | |
| NumericVector res(n); | |
| for(int i=0 ; i<n ; i++){ | |
| res[i] = lgamma(x[i]); | |
| } | |
| return(res); | |
| } | |
| // [[Rcpp::export]] | |
| NumericVector cpp_log_a_exp(double a, NumericVector mu, NumericVector exp_mu){ | |
| // faster this way | |
| int n = mu.length(); | |
| NumericVector res(n); | |
| for(int i=0 ; i<n ; i++){ | |
| if(mu[i] < 200){ | |
| res[i] = log(a + exp_mu[i]); | |
| } else { | |
| res[i] = mu[i]; | |
| } | |
| } | |
| return(res); | |
| } | |
| // [[Rcpp::export]] | |
| NumericVector cpp_partialDerivative_other(int iterMax, int Q, int N, double epsDeriv, NumericVector ll_d2, NumericVector dx_dother, NumericVector init, IntegerMatrix dumMat, IntegerVector nbCluster){ | |
| // takes in: | |
| // dumMat: the matrix of dummies (n X c) each obs => cluster // must be in cpp index!!! | |
| // init: the initialisation of the sum of derivatives vector | |
| // ll_d2: the second derivative | |
| // dx_dother: the vector of dx_dother | |
| int iter; | |
| int i, q, c; | |
| int index; | |
| int sum_cases=0; | |
| bool ok; | |
| double new_value; | |
| IntegerVector start(Q), end(Q); | |
| for(q=0 ; q<Q ; q++){ | |
| // the total number of clusters (eg if man/woman and 10 countries: total of 12 cases) | |
| sum_cases += nbCluster(q); | |
| if(q == 0){ | |
| start(q) = 0; | |
| end(q) = nbCluster(q); | |
| } else { | |
| start(q) = start(q-1) + nbCluster(q-1); | |
| end(q) = end(q-1) + nbCluster(q); | |
| } | |
| } | |
| NumericVector clusterDeriv(sum_cases); // the derivatives per cluster, stacked in one vector | |
| NumericVector sum_lld2(sum_cases); | |
| // Creation of the sum_lld2 | |
| for(i=0 ; i<N ; i++){ | |
| for(q=0 ; q<Q ; q++){ | |
| index = start[q] + dumMat(i, q); | |
| sum_lld2[index] += ll_d2(i); | |
| } | |
| } | |
| // the result to return | |
| NumericVector S(N); | |
| for(i=0 ; i<N ; i++){ | |
| S[i] = init(i); | |
| } | |
| ok = true; | |
| iter = 0; | |
| while( ok & (iter<iterMax) ){ | |
| iter++; | |
| ok = false; | |
| for(q=0 ; q<Q ; q++){ | |
| R_CheckUserInterrupt(); | |
| // init of the vector | |
| for(c=start[q] ; c<end[q] ; c++){ | |
| clusterDeriv(c) = 0; | |
| } | |
| for(i=0 ; i<N ; i++){ | |
| index = start[q] + dumMat(i, q); | |
| clusterDeriv(index) += dx_dother(i) + S(i)*ll_d2(i); | |
| } | |
| // on finit de calculer clusterDeriv + controle | |
| for(c=start[q] ; c<end[q] ; c++){ | |
| new_value = -clusterDeriv(c)/sum_lld2[c]; | |
| clusterDeriv(c) = new_value; | |
| if(fabs(new_value) > epsDeriv){ | |
| ok = true; | |
| } | |
| } | |
| // on ajoute la derivee a S: | |
| for(i=0 ; i<N ; i++){ | |
| index = start[q] + dumMat(i, q); | |
| S[i] += clusterDeriv(index); | |
| } | |
| } | |
| } | |
| // Rprintf("other, nb iter=%i\n", iter); | |
| if(iter == iterMax){ | |
| Rprintf("[Getting cluster deriv. other] Max iterations reached (%i)\n", iterMax); | |
| } | |
| return(S); | |
| } | |
| // Function to get the conditional sum of a matrix | |
| // [[Rcpp::export]] | |
| NumericMatrix cpp_tapply_sum(int Q, NumericMatrix x, IntegerVector dum){ | |
| // Q: nber of classes | |
| // N: nber of observations | |
| // x: a matrix | |
| // dum: the N vector of clusters | |
| int N = x.nrow(); | |
| int K = x.ncol(); | |
| NumericMatrix res(Q, K); | |
| int i, q, k; | |
| for(i=0 ; i<N ; i++){ | |
| q = dum(i) - 1; // we take 1 off => different indexation in C | |
| for(k=0 ; k<K ; k++){ | |
| res(q, k) += x(i, k); | |
| } | |
| } | |
| return(res); | |
| } | |
| // Function to get the conditional sum of a vector | |
| // [[Rcpp::export]] | |
| NumericVector cpp_tapply_vsum(int Q, NumericVector x, IntegerVector dum){ | |
| // Q: nber of classes | |
| // x: a matrix | |
| // dum: the N vector of clusters | |
| int N = x.size(); | |
| NumericVector res(Q); | |
| int i, q; | |
| for(i=0 ; i<N ; i++){ | |
| q = dum(i) - 1; // we take 1 off => different indexation in C | |
| res(q) += x(i); | |
| } | |
| return(res); | |
| } | |
| // similar a table but faster | |
| // [[Rcpp::export]] | |
| NumericVector cpp_table(int Q, IntegerVector dum){ | |
| // Q: nber of classes | |
| // dum: the N vector of clusters | |
| int N = dum.size(); | |
| NumericVector res(Q); | |
| int i, q; | |
| for(i=0 ; i<N ; i++){ | |
| q = dum(i) - 1; // we take 1 off => different indexation in C | |
| res(q)++; | |
| } | |
| return(res); | |
| } | |
| // | |
| // Getting the cluster coefficients | |
| // | |
| // [[Rcpp::export]] | |
| List cpp_get_fe_gnl(int Q, int N, NumericVector sumFE, IntegerMatrix dumMat, IntegerVector cluster_sizes, IntegerVector obsCluster){ | |
| // This function returns a list of the cluster coefficients for each cluster | |
| // dumMat: the matrix of cluster ID for each observation, with cpp index style | |
| // Q, N: nber of clusters / obs | |
| // cluster_sizes: vector of the number of cases per cluster | |
| // obsCluster: the integer vector that is equal to order(dum[[g]]) | |
| // RETURN: | |
| // a list for each cluster of the cluster coefficient value | |
| // + the last element is the number of clusters that have been set as references (nb_ref) | |
| // => much optimized version May 2019 | |
| int iter = 0, iterMax = 10000; | |
| int iter_loop = 0, iterMax_loop = 10000; | |
| // Creation of the indices to put all the cluster values into a single vector | |
| int nb_coef = 0; | |
| IntegerVector nb_ref(Q); // nb_ref takes the nb of elements set as ref | |
| for(int q=0 ; q<Q ; q++){ | |
| // the total number of clusters (eg if c1: man/woman and c2: 10 countries: total of 12 cases) | |
| nb_coef += cluster_sizes(q); | |
| } | |
| NumericVector cluster_values(nb_coef); | |
| IntegerVector cluster_visited(nb_coef); // whether a value has been already assigned | |
| // index of the cluster | |
| std::vector<int*> pindex_cluster(Q); | |
| std::vector<int> index_cluster(nb_coef); | |
| for(int i=0 ; i<nb_coef ; i++){ | |
| index_cluster[i] = i; | |
| } | |
| pindex_cluster[0] = index_cluster.data(); | |
| for(int q=1 ; q<Q ; q++){ | |
| pindex_cluster[q] = pindex_cluster[q - 1] + cluster_sizes(q - 1); | |
| } | |
| // Now we create the vector of observations for each cluster | |
| // we need a strating and an end vector as well | |
| IntegerVector start_cluster(nb_coef), end_cluster(nb_coef); | |
| int index; | |
| int k; | |
| for(int q=0 ; q<Q ; q++){ | |
| // table cluster: nber of elements for each cluster class | |
| IntegerVector tableCluster(cluster_sizes(q)); | |
| for(int i=0 ; i<N ; i++){ | |
| k = dumMat(i, q); | |
| tableCluster(k) += 1; // the number of obs per case | |
| } | |
| // now creation of the start/end vectors | |
| for(int k=0 ; k<cluster_sizes(q) ; k++){ | |
| int *pindex = pindex_cluster[q]; | |
| index = pindex[k]; | |
| // index = start(q) + k; | |
| if(k == 0){ | |
| start_cluster[index] = 0; | |
| end_cluster[index] = tableCluster[k]; | |
| } else { | |
| start_cluster[index] = end_cluster[index - 1]; | |
| end_cluster[index] = end_cluster[index - 1] + tableCluster[k]; | |
| } | |
| } | |
| } | |
| // matrix of the clusters that have been computed | |
| IntegerMatrix mat_done(N, Q); | |
| IntegerVector rowsums(N, 0); | |
| // vector of elements to loop over | |
| IntegerVector id2do(N); | |
| IntegerVector id2do_next(N); | |
| int nb2do = N, nb2do_next = N; | |
| for(int i=0 ; i<nb2do ; i++){ | |
| id2do(i) = i; | |
| id2do_next(i) = i; | |
| } | |
| // Other indices and variables | |
| int qui_max, obs; | |
| int rs, rs_max; // rs: row sum | |
| int id_cluster; | |
| double other_value; | |
| bool first; | |
| // | |
| // THE MAIN LOOP | |
| // | |
| while(iter < iterMax){ | |
| iter++; | |
| // | |
| // Finding the row where to put the 0s | |
| // | |
| if(iter == 1){ | |
| // 1st iter, we select the first element | |
| qui_max = 0; | |
| } else { | |
| // we find the row that has the maximum of items done | |
| qui_max = 0; | |
| rs_max = 0; | |
| for(int i=0 ; i<nb2do ; i++){ | |
| obs = id2do[i]; | |
| rs = rowsums[obs]; | |
| if(rs == Q-2){ | |
| // if rs == Q-2 => its the maximum possible, no need to go further | |
| qui_max = obs; | |
| break; | |
| } else if(rs<Q && rs>rs_max){ | |
| // this is to handle complicated cases with more than 3+ clusters | |
| qui_max = obs; | |
| rs_max = rs; | |
| } else if(qui_max == 0 && rs == 0){ | |
| // used to initialize qui_max | |
| qui_max = obs; | |
| } | |
| } | |
| } | |
| // Rprintf("Obs selected: %i\n", qui_max); | |
| // | |
| // Putting the 0s, ie setting the references | |
| // | |
| // the first element is spared | |
| first = true; | |
| for(int q=0 ; q<Q ; q++){ | |
| if(mat_done(qui_max, q) == 0){ | |
| if(first){ | |
| // we spare the first element | |
| first = false; | |
| } else { | |
| // we set the cluster to 0 | |
| // 1) we find the cluster | |
| id_cluster = dumMat(qui_max, q); | |
| // Rprintf("Cluster: %i\n", id_cluster + 1); | |
| // 2) we get the index of the cluster vector | |
| int *pindex = pindex_cluster[q]; | |
| index = pindex[id_cluster]; | |
| // 3) we set the cluster value to 0 | |
| cluster_values(index) = 0; | |
| // 4) we update the mat_done matrix for the elements of this cluster | |
| for(int i=start_cluster[index] ; i<end_cluster[index] ; i++){ | |
| obs = obsCluster(i, q); | |
| mat_done(obs, q) = 1; | |
| rowsums[obs]++; | |
| } | |
| // 5) => we save the information on which cluster was set as a reference | |
| nb_ref(q)++; | |
| } | |
| } | |
| } | |
| // | |
| // LOOP OF ALL OTHER UPDATES (CRITICAL) | |
| // | |
| iter_loop = 0; | |
| while(iter_loop < iterMax_loop){ | |
| iter_loop++; | |
| // Rprintf("nb2do_next: %i -- nb2do: %i\n", nb2do_next, nb2do); | |
| R_CheckUserInterrupt(); | |
| // | |
| // Selection of indexes (new way) to be updated | |
| // | |
| // initialisation of the observations to cover (before first loop the two are identical) | |
| if(iter_loop != 1){ | |
| nb2do = nb2do_next; | |
| for(int i=0 ; i<nb2do ; i++){ | |
| id2do[i] = id2do_next[i]; | |
| } | |
| } | |
| nb2do_next = 0; | |
| for(int i=0 ; i<nb2do ; i++){ | |
| // we compute the rowsum of the obs that still needs to be done | |
| obs = id2do[i]; | |
| rs = rowsums[obs]; | |
| if(rs < Q-1){ | |
| // you need to check it next time! | |
| id2do_next[nb2do_next] = obs; | |
| nb2do_next++; | |
| } else if(rs == Q-1){ | |
| // means: needs to be updated | |
| int q; | |
| for(q=0 ; mat_done(obs, q)!=0 ; q++){ | |
| // selection of the q that is equal to 0 | |
| } | |
| int *pindex = pindex_cluster[q]; | |
| int index_select = pindex[dumMat(obs, q)]; | |
| // Finding the value of the cluster coefficient | |
| other_value = 0; | |
| // Computing the sum of the other cluster values | |
| // and finding the cluster to be updated (q_select) | |
| for(int l=0 ; l<Q ; l++){ | |
| // we can loop over all q because cluster_values is initialized to 0 | |
| index = pindex_cluster[l][dumMat(obs, l)]; | |
| other_value += cluster_values(index); | |
| } | |
| // the index to update | |
| cluster_values(index_select) = sumFE(obs) - other_value; | |
| // Update of the mat_done | |
| for(int j=start_cluster[index_select] ; j<end_cluster[index_select] ; j++){ | |
| obs = obsCluster(j, q); | |
| mat_done(obs, q) = 1; | |
| rowsums[obs]++; | |
| } | |
| } | |
| } | |
| if(nb2do_next == nb2do) break; | |
| } | |
| // out of this loop: | |
| // - nb2do == nb2do_next | |
| // - id2do == id2do_next | |
| // Check that everything is all right | |
| if(iter_loop == iterMax_loop){ | |
| Rprintf("Problem getting FE, maximum iterations reached (2nd order loop)."); | |
| } | |
| // if no more obs to be covered | |
| if(nb2do_next == 0) break; | |
| } | |
| if(iter == iterMax){ | |
| Rprintf("Problem getting FE, maximum iterations reached (1st order loop)."); | |
| } | |
| // final formatting and save | |
| List res(Q + 1); | |
| int *pindex; | |
| for(int q=0 ; q<Q ; q++){ | |
| NumericVector quoi(cluster_sizes(q)); | |
| pindex = pindex_cluster[q]; | |
| for(k=0 ; k<cluster_sizes(q) ; k++){ | |
| index = pindex[k]; | |
| // index = start(q) + k; | |
| quoi(k) = cluster_values(index); | |
| } | |
| res(q) = quoi; | |
| } | |
| res(Q) = nb_ref; | |
| return(res); | |
| } | |
| // [[Rcpp::export]] | |
| double cpp_ssr_null(NumericVector y, NumericVector w = NumericVector(0)){ | |
| // simple fun to compute the ssr of the null ols model | |
| // 2/3 times faster than pure r | |
| bool is_weight = w.length() > 1; | |
| int n = y.length(); | |
| double denom = 0; | |
| // the "mean" | |
| double y_mean = 0; | |
| for(int i=0 ; i<n ; ++i){ | |
| if(is_weight){ | |
| y_mean += y[i] * w[i]; | |
| denom += w[i]; | |
| } else { | |
| y_mean += y[i]; | |
| } | |
| } | |
| if(is_weight){ | |
| y_mean = y_mean/denom; | |
| } else { | |
| y_mean = y_mean/n; | |
| } | |
| double res = 0, value = 0; | |
| for(int i=0 ; i<n ; ++i){ | |
| value = y[i] - y_mean; | |
| if(is_weight){ | |
| res += value * value * w[i]; | |
| } else { | |
| res += value * value; | |
| } | |
| } | |
| return(res); | |
| } | |
| // [[Rcpp::export]] | |
| double cpp_ssq(NumericVector x, NumericVector w = NumericVector(0)){ | |
| // simple fun to compute the sum of the square of the elt of a vector | |
| // 30% faster than pure r (twice faster with weights) | |
| bool is_weight = w.length() > 1; | |
| int n = x.length(); | |
| // the mean | |
| double res = 0; | |
| for(int i=0 ; i<n ; i++){ | |
| if(is_weight){ | |
| res += x[i] * x[i] * w[i]; | |
| } else { | |
| res += x[i] * x[i]; | |
| } | |
| } | |
| return res; | |
| } | |
| // [[Rcpp::export]] | |
| bool cpp_isConstant(NumericVector x){ | |
| // simple fun to see whether a variable is constant | |
| // it is unexpensive -- not the most useful function however: | |
| // for 1e7 obs, you gain 50ms over var(x)==0, but it's still 50ms! | |
| int n = x.length(); | |
| bool res = true; | |
| double value = x[0]; | |
| for(int i=1 ; i<n ; i++){ | |
| if(x[i] != value){ | |
| res = false; | |
| break; | |
| } | |
| } | |
| return(res); | |
| } | |
| // [[Rcpp::export]] | |
| bool cpp_any_na_null(SEXP x){ | |
| // > twice faster than testing the two separately | |
| // x is a vector | |
| int n = Rf_length(x); | |
| double *px = REAL(x); | |
| for(int i=0 ; i<n ; ++i){ | |
| double x_tmp = px[i]; | |
| if(std::isnan(x_tmp) || x_tmp == 0){ | |
| return true; | |
| } | |
| } | |
| return false; | |
| } | |
| // [[Rcpp::export]] | |
| int cpp_constant_dum(int k, NumericVector x, IntegerVector dum, bool only_0 = false){ | |
| // number of values of dum for which x is constant | |
| int n_obs = dum.length(); | |
| double ref = x[0]; | |
| int dum_current = dum[0]; | |
| bool found_different = only_0 ? ref != 0 : false; | |
| int nb_constant = 0; | |
| for(int i=1 ; i<n_obs ; ++i){ | |
| if(dum[i] != dum_current){ | |
| // new guy | |
| dum_current = dum[i]; | |
| if(found_different == false){ | |
| ++nb_constant; | |
| } | |
| ref = x[i]; | |
| found_different = only_0 ? ref != 0 : false; | |
| } else if(!found_different){ | |
| if(x[i] != ref){ | |
| found_different = true; | |
| } | |
| } | |
| } | |
| if(found_different == false){ | |
| ++nb_constant; | |
| } | |
| return nb_constant; | |
| } | |
| // | |
| // Lag related functions // | |
| // | |
| // [[Rcpp::export]] | |
| List cpp_find_duplicates(IntegerVector id, IntegerVector time){ | |
| // we check whether there are duplicated rows | |
| // if so, we provide information | |
| int n = id.length(); | |
| int n_dup = 0; | |
| int obs_dup = 0; | |
| bool any_dup = false; | |
| /// finding the first duplicate value | |
| int i = 0; | |
| for(i=1 ; i<n ; ++i){ | |
| if(time[i - 1] == time[i]){ | |
| if(id[i - 1] == id[i]){ | |
| any_dup = true; | |
| break; | |
| } | |
| } | |
| } | |
| // if dup: we find out the nber | |
| if(any_dup){ | |
| obs_dup = i; // the 1 is implicitely added to make it r style | |
| int id_dup = id[i]; | |
| int time_dup = time[i]; | |
| n_dup = 2; | |
| while(++i<n && id_dup == id[i] && time_dup == time[i]) n_dup++; | |
| } | |
| List res; | |
| res["n_dup"] = n_dup; | |
| res["obs_dup"] = obs_dup; | |
| return(res); | |
| } | |
| // [[Rcpp::export]] | |
| int cpp_pgcd(IntegerVector x){ | |
| // quick and dirty, but does not matter | |
| int n = x.length(); | |
| if(n == 1){ | |
| return(x[0]); | |
| } | |
| bool ok = false; | |
| int pgcd = x[0]; | |
| // the min | |
| for(int i=1 ; i<n ; ++i){ | |
| if(pgcd > x[i]){ | |
| pgcd = x[i]; | |
| } | |
| } | |
| // the denom | |
| while(!ok && pgcd > 1){ | |
| ok = true; | |
| for(int i=0 ; i<n ; ++i){ | |
| if(x[i] % pgcd != 0){ | |
| pgcd--; | |
| ok = false; | |
| break; | |
| } | |
| } | |
| } | |
| return pgcd; | |
| } | |
| // [[Rcpp::export]] | |
| IntegerVector cpp_lag_obs(IntegerVector id, IntegerVector time, int nlag){ | |
| // in case of ties, we sum | |
| // must be two consecutive years | |
| // returns an observation nber of where to find the lagged obs | |
| // note that we forbid duplicate rows!!!!! => in case of duplicate we use a rule of thumb | |
| int nobs = id.length(); | |
| int obs; | |
| int id_current; | |
| int time_current; | |
| IntegerVector res(nobs, NA_INTEGER); | |
| int i, j, diff_time; | |
| if(nlag > 0){ | |
| i = 0; | |
| while(i < nobs){ | |
| // R_CheckUserInterrupt(); // this is (too) costly | |
| id_current = id[i]; | |
| time_current = time[i]; | |
| obs = i + 1; // observation, R style | |
| j = i + 1; | |
| while(j < nobs){ | |
| diff_time = time[j] - time_current; | |
| if(id[j] != id_current){ | |
| // we start someone else | |
| i = j - 1; // minus 1 because after we indent i | |
| break; | |
| } else if(diff_time > nlag){ | |
| // we are too far => stop | |
| break; | |
| } else if(diff_time == 0){ | |
| // it's the same time/id => we skip | |
| ++i; | |
| } else if(diff_time < nlag){ | |
| // not far enough | |
| } else if(diff_time == nlag){ | |
| // match! | |
| res[j] = obs; | |
| } | |
| ++j; | |
| } | |
| ++i; | |
| } | |
| } else if(nlag < 0){ | |
| /************************************************************************** | |
| * NOTA: I could have tweaked the previous if() to get rid of the condition | |
| * but the code would have lost in clarity. | |
| * For the lead: opposite to what is done before | |
| ***************************************************************************/ | |
| int nlead = -nlag; | |
| i = nobs - 1; | |
| while(i >= 0){ | |
| // R_CheckUserInterrupt(); // this is (too) costly | |
| id_current = id[i]; | |
| time_current = time[i]; | |
| obs = i + 1; // observation, R style | |
| j = i - 1; | |
| while(j >= 0){ | |
| diff_time = time_current - time[j]; | |
| if(id[j] != id_current){ | |
| // we start someone else | |
| i = j + 1; // plus 1 because after we dedent i | |
| break; | |
| } else if(diff_time > nlead){ | |
| // we are too far => stop | |
| break; | |
| } else if(diff_time == 0){ | |
| // it's the same time/id => we skip | |
| --i; | |
| } else if(diff_time < nlead){ | |
| // not far enough | |
| } else if(diff_time == nlead){ | |
| // match! | |
| res[j] = obs; | |
| } | |
| --j; | |
| } | |
| --i; | |
| } | |
| } else { | |
| for(int i=0 ; i<nobs ; ++i){ | |
| res[i] = i + 1; | |
| } | |
| } | |
| return(res); | |
| } | |
| // [[Rcpp::export]] | |
| IntegerVector cpp_check_nested(SEXP fe_list, SEXP cluster_list, IntegerVector fe_sizes, int n){ | |
| // Returns boolean vector of whether each FE is nested in the clusters | |
| int Q = Rf_length(fe_list); | |
| int G = Rf_length(cluster_list); | |
| // SEXP x0 = VECTOR_ELT(x, 0); | |
| IntegerVector res(Q); | |
| for(int q=0 ; q<Q ; ++q){ | |
| int *pfe = INTEGER(VECTOR_ELT(fe_list, q)); | |
| for(int g=0 ; g<G ; ++g){ | |
| vector<int> fe_clust(fe_sizes[q]); | |
| int *pclust =INTEGER(VECTOR_ELT(cluster_list, g)); | |
| bool nested = true; | |
| int fe_value = 0; | |
| int clust_value = 0; | |
| for(int i=0 ; i<n ; ++i){ | |
| fe_value = pfe[i] - 1; | |
| clust_value = fe_clust[fe_value]; | |
| if(clust_value == 0){ | |
| fe_clust[fe_value] = pclust[i]; | |
| } else if(clust_value != pclust[i]){ | |
| nested = false; | |
| break; | |
| } | |
| } | |
| if(nested){ | |
| res[q] = 1; | |
| break; | |
| } | |
| } | |
| } | |
| return res; | |
| } | |
| // [[Rcpp::export]] | |
| NumericVector cpp_diag_XUtX(NumericMatrix X, NumericMatrix U){ | |
| // computes the diagonal of X %*% U %*% t(X) | |
| int n = X.nrow(); | |
| int K = X.ncol(); | |
| NumericVector res(n); | |
| for(int i=0 ; i<n ; ++i){ | |
| double res_i = 0; | |
| for(int k=0 ; k<K ; ++k){ | |
| double xk = 0; | |
| for(int k2=0 ; k2<K ; ++k2){ | |
| xk += X(i, k2) * U(k, k2); | |
| } | |
| res_i += xk * X(i,k); | |
| } | |
| res[i] = res_i; | |
| } | |
| return res; | |
| } | |
| class simple_vec_double{ | |
| simple_vec_double() = delete; | |
| double *px_double = nullptr; | |
| int *px_int = nullptr; | |
| int n; | |
| bool is_real; | |
| public: | |
| simple_vec_double(SEXP x); | |
| double operator[](int); | |
| }; | |
| simple_vec_double::simple_vec_double(SEXP x){ | |
| n = Rf_length(x); | |
| if(TYPEOF(x) == REALSXP){ | |
| px_double = REAL(x); | |
| is_real = true; | |
| } else if(TYPEOF(x) == INTSXP){ | |
| px_int = INTEGER(x); | |
| is_real = false; | |
| } else { | |
| stop("Error: Wrong argument type in cpp_factor_matrix."); | |
| } | |
| } | |
| double simple_vec_double::operator[](int i){ | |
| if(i >= n){ | |
| return 1; | |
| } else if(is_real){ | |
| return px_double[i]; | |
| } else { | |
| return static_cast<double>(px_int[i]); | |
| } | |
| } | |
| // [[Rcpp::export]] | |
| NumericVector cpp_factor_matrix(IntegerVector fact, LogicalVector is_na_all, IntegerVector who_is_dropped, SEXP var, CharacterVector col_names){ | |
| // fact: integer vector from 1 (!) to K, can contain NAs | |
| // Checking Na is cheap as opposed to populating the matrix, but having an argument avoids creating a new object | |
| int n = fact.length(); | |
| int K = 0; | |
| // Finding out the TOTAL nber of cols (before removal) | |
| for(int i=0 ; i<n ; ++i){ | |
| if(!is_na_all[i] && K < fact[i]){ | |
| K = fact[i]; | |
| } | |
| } | |
| // Are there values to remove? If so, we create a mapping | |
| int n_drop = who_is_dropped[0] == -1 ? 0 : Rf_length(who_is_dropped); | |
| bool IS_REMOVAL = n_drop > 0; | |
| std::vector<int> mapping; | |
| if(IS_REMOVAL){ | |
| mapping.resize(K); | |
| for(int k=0 ; k<K ; ++k){ | |
| mapping[k] = k; | |
| } | |
| // In mapping: values equal to -1 mean dropped value | |
| int k_drop = 0; | |
| for(int k=0 ; k<K ; ++k){ | |
| if(k_drop < n_drop && k + 1 == who_is_dropped[k_drop]){ | |
| ++k_drop; | |
| mapping[k] = -1; | |
| } else { | |
| mapping[k] -= k_drop; | |
| } | |
| } | |
| K -= k_drop; | |
| } | |
| NumericMatrix res(n, K); | |
| // The interacted var | |
| simple_vec_double my_var(var); | |
| // Filling the matrix | |
| for(int i=0 ; i<n ; ++i){ | |
| if(is_na_all[i]){ | |
| // we fill the row | |
| for(int k=0 ; k<K ; ++k){ | |
| res(i, k) = NA_REAL; | |
| } | |
| } else if(IS_REMOVAL) { | |
| if(mapping[fact[i] - 1] != -1){ | |
| res(i, mapping[fact[i] - 1]) = my_var[i]; | |
| } | |
| } else { | |
| res(i, fact[i] - 1) = my_var[i]; | |
| } | |
| } | |
| colnames(res) = col_names; | |
| return res; | |
| } | |
| // [[Rcpp::export]] | |
| std::string cpp_add_commas(double x, int r = 1, bool whole = true){ | |
| // a bit like (but not exactly equal to) format(x, nsmall = 1, big.mark = ",") but about 40-100 times faster | |
| // for whole numbers => no trailing digits | |
| // does not accept vectors, although super easy to expand to vectors | |
| double xr = round(x * pow(10, r)) / pow(10, r); | |
| std::string x_str = std::to_string(static_cast<int>(abs(xr))); | |
| std::string res; | |
| if(xr < 0){ | |
| res.push_back('-'); | |
| xr = -xr; | |
| } | |
| if(xr < 1000){ | |
| res.insert(res.size(), x_str); | |
| } else { | |
| int n = x_str.size(); | |
| int e = n; // e: exponent | |
| while(e > 0){ | |
| res.push_back(x_str[n - e]); | |
| --e; | |
| if(e > 1 && e % 3 == 0) { | |
| res.push_back(','); | |
| } | |
| } | |
| } | |
| double rest = xr - floor(xr); | |
| bool is_whole = whole && (x - floor(x)) == 0; | |
| if(!is_whole && r > 0){ | |
| // not a whole number | |
| res.push_back('.'); | |
| std::string rest_str = std::to_string(rest); | |
| int nr = rest_str.size(); | |
| for(int i=2 ; i < nr && i-1 <= r ; ++i){ | |
| res.push_back(rest_str[i]); | |
| } | |
| } | |
| return res; | |
| } | |
| // [[Rcpp::export]] | |
| List cpp_find_never_always_treated(IntegerVector cohort, NumericVector period){ | |
| // Note that both cohort and period are sorted according to cohort | |
| IntegerVector always_treated; | |
| // cohort_ref => the guys that will be out of the interaction | |
| IntegerVector cohort_ref; | |
| int n = cohort.size(); | |
| bool is_pos = false; | |
| bool is_neg = false; | |
| bool is_ok = false; | |
| int j = cohort[0]; | |
| if(period[0] < 0){ | |
| is_neg = true; | |
| } else { | |
| is_pos = true; | |
| } | |
| for(int i=1 ; i<n ; ++i){ | |
| if(j == cohort[i]){ | |
| if(!is_ok){ | |
| // condition avoids checking period | |
| // only worth for very (very) long vectors | |
| if(period[i] < 0){ | |
| is_neg = true; | |
| is_ok = is_pos; | |
| } else { | |
| is_pos = true; | |
| is_ok = is_neg; | |
| } | |
| } | |
| } else { | |
| // we change IDs | |
| if(!is_ok){ | |
| if(is_pos) always_treated.push_back(j); | |
| cohort_ref.push_back(j); | |
| } | |
| // re-init | |
| j = cohort[i]; | |
| is_neg = false; | |
| is_pos = false; | |
| is_ok = false; | |
| } | |
| } | |
| // Last element | |
| if(!is_ok){ | |
| if(is_pos) always_treated.push_back(j); | |
| cohort_ref.push_back(j); | |
| } | |
| List res; | |
| res["always_treated"] = always_treated; | |
| res["ref"] = cohort_ref; | |
| return res; | |
| } | |
| // [[Rcpp::export]] | |
| IntegerVector cpp_get_first_item(IntegerVector x, int n_items){ | |
| // observation id of the first occurrence | |
| // x ranges from 1 to n_items | |
| // we return indexes R style | |
| int n = x.size(); | |
| IntegerVector res(n_items); | |
| for(int i=0 ; i<n ; ++i){ | |
| if(res[x[i] - 1] == 0){ | |
| res[x[i] - 1] = i + 1; | |
| } | |
| } | |
| return res; | |
| } | |
| // [[Rcpp::export]] | |
| IntegerVector cpp_combine_clusters(SEXP cluster_list, IntegerVector index){ | |
| // cluster: list of integer vectors, each ranging from 1 to the number of cases | |
| // index: result of order() on the clusters | |
| if(TYPEOF(cluster_list) != VECSXP){ | |
| stop("Internal error: Only lists are accepted!"); | |
| } | |
| int Q = Rf_length(cluster_list); | |
| int n = index.size(); | |
| IntegerVector res(n); | |
| // Loading the data | |
| vector<int*> pcluster(Q); | |
| for(int q=0 ; q<Q ; ++q){ | |
| SEXP cluster_q = VECTOR_ELT(cluster_list, q); | |
| pcluster[q] = INTEGER(cluster_q); | |
| } | |
| // the observation ID | |
| int obs = index[0] - 1; | |
| // vector holding the current value | |
| vector<int> current_value(Q); | |
| // initialization | |
| int counter = 1; | |
| res[obs] = counter; | |
| for(int q=0 ; q<Q ; ++q){ | |
| current_value[q] = pcluster[q][obs]; | |
| } | |
| // we loop on the vector and flag values that are different | |
| int q = 0; | |
| for(int i=1 ; i<n ; ++i){ | |
| obs = index[i] - 1; | |
| for(q=0 ; q<Q ; ++q){ | |
| if(pcluster[q][obs] != current_value[q]){ | |
| break; | |
| } | |
| } | |
| // if the condition holds => means the values are different | |
| if(q < Q){ | |
| ++counter; | |
| // we save the new values | |
| for(; q<Q ; ++q){ | |
| current_value[q] = pcluster[q][obs]; | |
| } | |
| } | |
| res[obs] = counter; | |
| } | |
| return res; | |
| } | |
| // [[Rcpp::export]] | |
| List cpp_cut(NumericVector x_sorted, NumericVector cut_points, IntegerVector is_included){ | |
| // x_sorted: no NA, sorted | |
| // cut_points: bounds | |
| // is_included: for each bound, if it is included or not | |
| // | |
| int N = x_sorted.length(); | |
| int n_cuts = cut_points.length(); | |
| bool is_int = true; | |
| for(int i=0 ; i<N ; ++i){ | |
| if(fabs(x_sorted[i] - round(x_sorted[i])) > 0.00000000001){ | |
| is_int = false; | |
| break; | |
| } | |
| } | |
| IntegerVector x_int(N, n_cuts + 1); | |
| IntegerVector isnt_empty(n_cuts + 1); | |
| NumericVector value_min(n_cuts + 1); | |
| NumericVector value_max(n_cuts + 1); | |
| int index = 0; | |
| bool first = true; | |
| bool include = is_included[0]; | |
| double cutoff = cut_points[0]; | |
| int i = 0; | |
| while(i < N){ | |
| if(include ? x_sorted[i] <= cutoff : x_sorted[i] < cutoff){ | |
| if(first){ | |
| isnt_empty[index] = true; | |
| value_min[index] = x_sorted[i]; | |
| first = false; | |
| } | |
| x_int[i] = index + 1; | |
| ++i; | |
| } else { | |
| // we increment index | |
| // bins can be empty | |
| if(isnt_empty[index] && i > 0){ | |
| value_max[index] = x_sorted[i - 1]; | |
| } | |
| ++index; | |
| if(index == n_cuts){ | |
| // last bin: we've done it at initialization | |
| isnt_empty[index] = true; | |
| value_min[index] = x_sorted[i]; | |
| value_max[index] = x_sorted[N - 1]; | |
| break; | |
| } | |
| include = is_included[index]; | |
| cutoff = cut_points[index]; | |
| first = true; | |
| } | |
| } | |
| if(index != n_cuts){ | |
| value_max[index] = x_sorted[N - 1]; | |
| } | |
| List res; | |
| res["x_int"] = x_int; | |
| res["isnt_empty"] = isnt_empty; | |
| res["value_min"] = value_min; | |
| res["value_max"] = value_max; | |
| res["is_int"] = is_int; | |
| return res; | |
| } | |
| // [[Rcpp::export]] | |
| bool cpp_is_int(SEXP x){ | |
| if(TYPEOF(x) == INTSXP){ | |
| return true; | |
| } | |
| if(TYPEOF(x) != REALSXP){ | |
| return false; | |
| } | |
| int N = Rf_length(x); | |
| double *px = REAL(x); | |
| bool is_int = true; | |
| for(int i=0 ; i<N ; ++i){ | |
| if(fabs(px[i] - round(px[i])) > 0.00000000001){ | |
| is_int = false; | |
| break; | |
| } | |
| } | |
| return is_int; | |
| } | |
| // [[Rcpp::export]] | |
| double cpp_hash_string(std::string x){ | |
| // simple function hashing a string | |
| // used to identify tables in etable | |
| double res = std::hash<std::string>{}(x); | |
| return res; | |
| } | |
| inline bool is_md_markup(const char * x, int i, int n){ | |
| if(x[i] != '*') return false; | |
| if(i + 1 >= n || x[i + 1] != '*') return true; | |
| if(i + 2 >= n || x[i + 2] != '*') return true; | |
| if(i + 3 < n && x[i + 3] == '*') return false; | |
| return true; | |
| } | |
| inline bool is_special_char(const char x){ | |
| return x == '&' || x == '%' || x == '_' || x == '^' || x == '#'; | |
| } | |
| inline int find_id_markup(const char * x, int i, int n){ | |
| if(i + 1 >= n || x[i + 1] != '*') return 1; | |
| if(i + 2 >= n || x[i + 2] != '*') return 2; | |
| return 3; | |
| } | |
| std::string apply_escape_markup(const char * x){ | |
| // | |
| // | |
| // | |
| // element 0 is the main character vector: it is never closed | |
| int n = std::strlen(x); | |
| std::vector<std::string> tmp_all(4, ""); | |
| std::string tmp = "", res = ""; | |
| std::vector<bool> is_open(4, false); | |
| std::vector<int> id_open; | |
| id_open.push_back(0); | |
| std::vector<std::string> open_tags{"", "\\textit{", "\\textbf{", "\\textbf{\\textit{"}; | |
| std::vector<std::string> closing_tags{"", "}", "}", "}}"}; | |
| std::vector<std::string> stars{"", "*", "**", "***"}; | |
| int id_mkp = 0; | |
| int i = 0, i_save = 0; | |
| while(i < n){ | |
| if(x[i] == '$'){ | |
| if(i > 0 && x[i - 1] == '\\'){ | |
| tmp_all[id_mkp] += '$'; | |
| } else { | |
| // This is an equation | |
| i_save = i; | |
| ++i; | |
| tmp = "$"; | |
| while(i < n && (x[i] != '$' || x[i - 1] == '\\')){ | |
| tmp += x[i]; | |
| ++i; | |
| } | |
| if(i == n){ | |
| // we went all the way without a closing $ | |
| // => we come back, escape it and continue | |
| tmp_all[id_mkp] += "\\$"; | |
| i = i_save; | |
| } else { | |
| tmp_all[id_mkp] += tmp + "$"; | |
| } | |
| } | |
| } else if(is_special_char(x[i])){ | |
| if(i > 0 && x[i - 1] == '\\'){ | |
| // we do nothing | |
| tmp_all[id_mkp] += x[i]; | |
| } else { | |
| // we escape | |
| tmp_all[id_mkp] += "\\"; | |
| tmp_all[id_mkp] += x[i]; | |
| } | |
| } else if(x[i] == '\\'){ | |
| if(i + 1 < n && x[i + 1] == '*'){ | |
| // escape of MD markup | |
| ++i; | |
| while(i < n && x[i] == '*'){ | |
| tmp_all[id_mkp] += '*'; | |
| ++i; | |
| } | |
| --i; // compensated at the end of the main while | |
| } else { | |
| tmp_all[id_mkp] += '\\'; | |
| } | |
| } else if(is_md_markup(x, i, n)){ | |
| id_mkp = find_id_markup(x, i, n); | |
| i += id_mkp - 1; | |
| if(is_open[id_mkp]){ | |
| for(int j=id_open.size() - 1 ; j >= 1 ; --j){ | |
| int id_j = id_open[j]; | |
| int id_prev = id_open[j - 1]; | |
| if(id_j == id_mkp){ | |
| // we apply the markup | |
| tmp = open_tags[id_mkp] + tmp_all[id_mkp] + closing_tags[id_mkp]; | |
| tmp_all[id_prev] += tmp; | |
| is_open[id_mkp] = false; | |
| id_open.pop_back(); | |
| tmp_all[id_mkp] = ""; | |
| id_mkp = id_prev; | |
| break; | |
| } else { | |
| // we flush: the markup is invalid | |
| // example: "** bonjour * les gens **" | |
| tmp = stars[id_j] + tmp_all[id_j]; | |
| tmp_all[id_j] = ""; | |
| tmp_all[id_prev] += tmp; | |
| is_open[id_j] = false; | |
| id_open.pop_back(); | |
| } | |
| } | |
| } else { | |
| is_open[id_mkp] = true; | |
| id_open.push_back(id_mkp); | |
| } | |
| } else { | |
| tmp_all[id_mkp] += x[i]; | |
| } | |
| ++i; | |
| } | |
| // we flush the rest if needed | |
| for(int j=id_open.size() - 1 ; j >= 1 ; --j){ | |
| int id_j = id_open[j]; | |
| int id_prev = id_open[j - 1]; | |
| tmp = stars[id_j] + tmp_all[id_j]; | |
| tmp_all[id_j] = ""; | |
| tmp_all[id_prev] += tmp; | |
| } | |
| return tmp_all[0]; | |
| } | |
| // [[Rcpp::export]] | |
| StringVector cpp_escape_markup(SEXP Rstr){ | |
| // cpp_escape_markup("**bonjour** *les* ***gens * \\***heureux*** ") | |
| // cpp_escape_markup("stars: 10%: *, 5%: **, 1%: ***") | |
| int n = LENGTH(Rstr); | |
| StringVector res(n); | |
| if(n == 0){ | |
| return res; | |
| } | |
| for(int i=0 ; i<n ; ++i){ | |
| res[i] = apply_escape_markup(CHAR(STRING_ELT(Rstr, i))); | |
| } | |
| return res; | |
| } | |